|
| M0_BOB_DEFINE (static, &pdclust_bob, m0_pdclust_layout) |
|
| M0_BOB_DEFINE (static, &pdclust_instance_bob, m0_pdclust_instance) |
|
static bool | pdclust_allocated_invariant (const struct m0_pdclust_layout *pl) |
|
static bool | pdclust_invariant (const struct m0_pdclust_layout *pl) |
|
static bool | pdclust_instance_invariant (const struct m0_pdclust_instance *pi) |
|
static int | pdclust_register (struct m0_layout_domain *dom, const struct m0_layout_type *lt) |
|
static void | pdclust_unregister (struct m0_layout_domain *dom, const struct m0_layout_type *lt) |
|
static void | pdclust_fini (struct m0_ref *ref) |
|
static int | pdclust_allocate (struct m0_layout_domain *dom, uint64_t lid, struct m0_layout **out) |
|
static void | pdclust_delete (struct m0_layout *l) |
|
static int | pdclust_populate (struct m0_pdclust_layout *pl, const struct m0_pdclust_attr *attr, struct m0_layout_enum *le, uint32_t user_count) |
|
M0_INTERNAL int | m0_pdclust_build (struct m0_layout_domain *dom, uint64_t lid, const struct m0_pdclust_attr *attr, struct m0_layout_enum *le, struct m0_pdclust_layout **out) |
|
M0_INTERNAL bool | m0_pdclust_attr_check (const struct m0_pdclust_attr *attr) |
|
M0_INTERNAL uint32_t | m0_pdclust_N (const struct m0_pdclust_layout *pl) |
|
M0_INTERNAL uint32_t | m0_pdclust_K (const struct m0_pdclust_layout *pl) |
|
M0_INTERNAL uint32_t | m0_pdclust_S (const struct m0_pdclust_layout *pl) |
|
M0_INTERNAL uint32_t | m0_pdclust_P (const struct m0_pdclust_layout *pl) |
|
M0_INTERNAL uint32_t | m0_pdclust_size (const struct m0_pdclust_layout *pl) |
|
M0_INTERNAL uint64_t | m0_pdclust_unit_size (const struct m0_pdclust_layout *pl) |
|
M0_INTERNAL struct m0_pdclust_layout * | m0_layout_to_pdl (const struct m0_layout *l) |
|
M0_INTERNAL struct m0_layout * | m0_pdl_to_layout (struct m0_pdclust_layout *pl) |
|
M0_INTERNAL struct m0_pdclust_instance * | m0_layout_instance_to_pdi (const struct m0_layout_instance *li) |
|
static struct m0_pdclust_layout * | pi_to_pl (struct m0_pdclust_instance *pi) |
|
static struct m0_layout_enum * | pdclust_instance_to_enum (const struct m0_layout_instance *li) |
|
M0_INTERNAL enum m0_pdclust_unit_type | m0_pdclust_unit_classify (const struct m0_pdclust_layout *pl, int unit) |
|
static m0_bcount_t | pdclust_max_recsize (struct m0_layout_domain *dom) |
|
static int | pdclust_decode (struct m0_layout *l, struct m0_bufvec_cursor *cur, enum m0_layout_xcode_op op, struct m0_be_tx *tx, uint32_t user_count) |
|
static int | pdclust_encode (struct m0_layout *l, enum m0_layout_xcode_op op, struct m0_be_tx *tx, struct m0_bufvec_cursor *out) |
|
static m0_bcount_t | pdclust_recsize (const struct m0_layout *l) |
|
static uint64_t | m_enc (uint64_t width, uint64_t row, uint64_t column) |
|
static void | m_dec (uint64_t width, uint64_t pos, uint64_t *row, uint64_t *column) |
|
static void | permute (uint32_t n, uint32_t *k, uint32_t *s, uint32_t *r) |
|
static uint64_t | permute_column (struct m0_pdclust_instance *pi, uint64_t omega, uint64_t t) |
|
M0_INTERNAL void | m0_pdclust_instance_map (struct m0_pdclust_instance *pi, const struct m0_pdclust_src_addr *src, struct m0_pdclust_tgt_addr *tgt) |
|
M0_INTERNAL void | m0_pdclust_instance_inv (struct m0_pdclust_instance *pi, const struct m0_pdclust_tgt_addr *tgt, struct m0_pdclust_src_addr *src) |
|
M0_INTERNAL void | pdclust_instance_fini (struct m0_layout_instance *li) |
|
M0_INTERNAL void | m0_pdclust_perm_cache_destroy (struct m0_layout *layout, struct m0_pdclust_instance *pi) |
|
M0_INTERNAL int | m0_pdclust_perm_cache_build (struct m0_layout *layout, struct m0_pdclust_instance *pi) |
|
M0_INTERNAL bool | m0_pdclust_is_replicated (struct m0_pdclust_layout *play) |
|
static int | pdclust_instance_build (struct m0_layout *l, const struct m0_fid *fid, struct m0_layout_instance **out) |
|
| M0_BASSERT (M0_IS_8ALIGNED(sizeof(struct m0_layout_pdclust_rec))) |
|
Parity de-clustered layouts. See the link below for HLD and references to the literature. Parity de-clustering generalises higher RAID patterns (N+K, with K > 1) for the case where an object is striped over more target objects ("devices" in the traditional RAID terminology) than there are units in a parity group. Due to this, parity de-clustered layouts are parametrised by four numbers:
- N—number of data units in a parity group;
- S—number of spare units in a parity group;
- K—number of parity units in a parity group. Data in an object striped with a given K can survive a loss of up to K target objects. When a target object failure is repaired, distributed spare units are used to store re-constructed data. There are S spare units in each parity group, so that each parity group consists of N+K+S units;
- P—number of target objects over which layout stripes data, parity and spare units. A target object is divided into frames of unit size.
Layout maps source units to target frames. This mapping is defined in terms of "tiles" which are groups of frames. A tile can be seen either as an L*P block of frames, L rows of P columns each, each row containing frames with the same offset in every target object, or as a C*(N+K+S) block of C groups, N+K+S frames each. Here L and C are two additional layout parameters selected so that C*(N+K+S) == L*P.
Looking at a tile as a C*(N+K+S) block, map C consecutive parity groups (each containing N+K+S units) to it, then switch to L*P view and apply a certain permutation (depending on tile number) to columns.
The HLD explains why the resulting layout mapping function possesses a number of desirable properties.
For documentation links, please refer to the file doc/motr-design-doc-list.rst.
Implementation overview.
The parity de-clustering layout mapping function requires some amount of code dealing with permutations, random sequence generation and conversions between matrices of different shapes.
First, as explained in the HLD, an efficient way to generate permutations uniformly scattered across the set of all permutations of a given set is necessary. To this end permute_column() uses a sequence of pseudo-random numbers obtained from a PRNG (m0_rnd()). A few comments are in order:
- to seed a PRNG, layout seed and tile number are hashed by a multiplicative hash (m0_hash());
- system PRNG cannot be used, because reproducible sequences are needed. m0_rnd() is a very simple linear congruential generator straight from TAOCP. It takes care to return higher, more random, bits of result;
- layout behavior is quite sensitive to the PRNG properties. For example, if m0_rnd() is changed to return lower bits (result % max), resulting distribution of spare and parity units is not uniform even for large number of units. Experiments with different PRNGs are indicated.
Once permutation's Lehmer code is generated, it has to be applied to the set of columns. permute() function applies a permutation, simultaneously building an inverse permutation.
Finally, layout mapping function is defined in terms of conversions between matrices of different shapes. Let's call a matrix having M columns and an arbitrary (probably infinite) number of rows an M-matrix. An element of an M-matrix has (row, column) coordinates. Coordinate pairs can be ordered and enumerated in the "row first" lexicographical order:
(0, 0) < (0, 1) < ... < (0, M - 1) < (1, 0) < ...
Function m_enc() returns the number a (row, column) element of an M-matrix has in this ordering. Conversely, function m_dec() returns coordinates of the element having a given number in the ordering. With the help of these two functions an M-matrix can be re-arranged into an N-matrix in such a way that the element's position in the ordering remains invariant.
Layout mapping function m0_pdclust_instance_map() performs these re-arrangements in the following places:
- to convert a parity group number to a (tile number, group in tile) pair. This is a conversion of 1-matrix to C-matrix;
- to convert a tile from C*(N + K + S) to L*P form. This is a conversion of (N + K + S)-matrix to P-matrix;
- to convert a (tile number, frame in tile) pair to a target frame number. This is a conversion of L-matrix to 1-matrix.
Inverse layout mapping function m0_pdclust_instance_inv() performs reverse conversions.
◆ M0_PDCLUST_SEED
#define M0_PDCLUST_SEED "upjumpandpumpim," |
◆ M0_TRACE_SUBSYSTEM
#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_LAYOUT |
◆ m0_pdclust_unit_type
Classification of units in a parity group.
Enumerator |
---|
M0_PUT_DATA | |
M0_PUT_PARITY | |
M0_PUT_SPARE | |
M0_PUT_NR | |
Definition at line 89 of file pdclust.h.
◆ M0_BASSERT()
◆ M0_BOB_DEFINE() [1/2]
◆ M0_BOB_DEFINE() [2/2]
◆ m0_layout_instance_to_pdi()
◆ m0_layout_to_pdl()
◆ m0_pdclust_attr_check()
M0_INTERNAL bool m0_pdclust_attr_check |
( |
const struct m0_pdclust_attr * |
attr | ) |
|
Returns true iff pa_P >= pa_N + pa_K + pa_S
Definition at line 340 of file pdclust.c.
◆ m0_pdclust_build()
Allocates and builds a layout object with the pdclust layout type, by setting its initial ref count to 1.
- Postcondition
- ergo(rc == 0, pdclust_invariant(*out))
-
ergo(rc == 0, m0_ref_read(&l->l_ref) == 1)
- Note
- The layout object built by this API is to be finalised by releasing 'the reference on it that has been held during its creation'.
- See also
- m0_layout_put()
In short: Dual to m0_layout_put() when it is the last reference being released.
Definition at line 305 of file pdclust.c.
◆ m0_pdclust_instance_inv()
Reverse layout mapping function.
This function is a right inverse of layout mapping function. It is used by SNS repair and other server side mechanisms.
Definition at line 746 of file pdclust.c.
◆ m0_pdclust_instance_map()
Layout mapping function.
This function contains main parity de-clustering logic. It maps source units to target frames. It is used by client IO code to build IO requests and to direct them to the target objects.
Definition at line 701 of file pdclust.c.
◆ m0_pdclust_is_replicated()
◆ m0_pdclust_K()
◆ m0_pdclust_N()
◆ m0_pdclust_P()
◆ m0_pdclust_perm_cache_build()
◆ m0_pdclust_perm_cache_destroy()
◆ m0_pdclust_S()
◆ m0_pdclust_size()
◆ m0_pdclust_unit_classify()
Returns type of the given unit according to layout information.
Definition at line 425 of file pdclust.c.
◆ m0_pdclust_unit_size()
◆ m0_pdl_to_layout()
◆ m_dec()
static void m_dec |
( |
uint64_t |
width, |
|
|
uint64_t |
pos, |
|
|
uint64_t * |
row, |
|
|
uint64_t * |
column |
|
) |
| |
|
static |
"Decoding" function: returns (row, column) coordinates of a pos-th element in a matrix with "width" columns when elements are counted row by row. This function is denoted d_{width} in the HLD.
- See also
- m_enc()
Definition at line 591 of file pdclust.c.
◆ m_enc()
static uint64_t m_enc |
( |
uint64_t |
width, |
|
|
uint64_t |
row, |
|
|
uint64_t |
column |
|
) |
| |
|
static |
"Encoding" function: returns the number that a (row, column) element of a matrix with "width" columns has when elements are counted row by row. This function is denoted e_{width} in the HLD.
- See also
- m_dec()
Definition at line 578 of file pdclust.c.
◆ pdclust_allocate()
Implementation of lto_allocate() for PDCLUST layout type.
Definition at line 210 of file pdclust.c.
◆ pdclust_allocated_invariant()
◆ pdclust_decode()
Implementation of lo_decode() for pdclust layout type.
Definition at line 445 of file pdclust.c.
◆ pdclust_delete()
static void pdclust_delete |
( |
struct m0_layout * |
l | ) |
|
|
static |
Implementation of lo_delete() for PDCLUST layout type.
Definition at line 244 of file pdclust.c.
◆ pdclust_encode()
Implementation of lo_encode() for pdclust layout type.
Definition at line 514 of file pdclust.c.
◆ pdclust_fini()
static void pdclust_fini |
( |
struct m0_ref * |
ref | ) |
|
|
static |
Implementation of lo_fini for pdclust layout type.
Definition at line 192 of file pdclust.c.
◆ pdclust_instance_build()
Implementation of lo_instance_build().
Allocates and builds a parity de-clustered layout instance using the supplied layout 'l' that is necessarily of the type pdclust. It acquires an additional reference on that layout.
- Precondition
- pdclust_invariant(pl)
- Postcondition
- ergo(rc == 0, pdclust_instance_invariant(*out) && m0_ref_read(&l->l_ref) > 1)
Definition at line 844 of file pdclust.c.
◆ pdclust_instance_fini()
Implementation of lio_fini().
Definition at line 929 of file pdclust.c.
◆ pdclust_instance_invariant()
◆ pdclust_instance_to_enum()
Implementation of lio_to_enum()
Definition at line 416 of file pdclust.c.
◆ pdclust_invariant()
◆ pdclust_max_recsize()
Implementation of lto_max_recsize() for pdclust layout type.
Definition at line 436 of file pdclust.c.
◆ pdclust_populate()
Populates pl using the arguments supplied.
Definition at line 262 of file pdclust.c.
◆ pdclust_recsize()
Implementation of lo_recsize() for pdclust layout type.
Definition at line 555 of file pdclust.c.
◆ pdclust_register()
Implementation of lto_register for PDCLUST layout type. No table is required specifically for PDCLUST layout type.
Definition at line 179 of file pdclust.c.
◆ pdclust_unregister()
Implementation of lto_unregister for PDCLUST layout type.
Definition at line 186 of file pdclust.c.
◆ permute()
static void permute |
( |
uint32_t |
n, |
|
|
uint32_t * |
k, |
|
|
uint32_t * |
s, |
|
|
uint32_t * |
r |
|
) |
| |
|
static |
Apply a permutation given by its Lehmer code in k[] to a set s[] of n elements and build inverse permutation in r[].
- Parameters
-
n | - number of elements in k[], s[] and r[] |
k | - Lehmer code of the permutation |
s | - an array to permute |
r | - an array to build inverse permutation in |
- Precondition
- m0_forall(i, n, k[i] + i < n)
-
m0_forall(i, n, s[i] < n && ergo(s[i] == s[j], i == j))
- Postcondition
- m0_forall(i, n, s[i] < n && ergo(s[i] == s[j], i == j))
-
m0_forall(i, n, s[r[i]] == i && r[s[i]] == i)
Definition at line 611 of file pdclust.c.
◆ permute_column()
Returns column number that a column t has after a permutation for tile omega is applied.
- Todo:
- Not sure if this should be replaced by an ADDB DP or a M0_LOG.
Definition at line 652 of file pdclust.c.
◆ pi_to_pl()
◆ m0_pdclust_layout_type [1/2]
◆ m0_pdclust_layout_type [2/2]
Initial value:= {
.lt_name = "pdclust",
.lt_id = 0,
.lt_ref_count = 0,
}
static const struct m0_layout_type_ops pdclust_type_ops
Definition at line 968 of file pdclust.c.
◆ M0_PDCLUST_SRC_NULL [1/2]
◆ M0_PDCLUST_SRC_NULL [2/2]
◆ pdclust_bob
Initial value:= {
.bt_name = "pdclust",
}
#define offsetof(typ, memb)
Definition at line 105 of file pdclust.c.
◆ pdclust_instance_bob
Initial value:= {
.bt_name = "pd_instance",
}
#define offsetof(typ, memb)
Definition at line 114 of file pdclust.c.
◆ pdclust_instance_ops
Initial value:= {
}
static struct m0_layout_enum * pdclust_instance_to_enum(const struct m0_layout_instance *li)
M0_INTERNAL void pdclust_instance_fini(struct m0_layout_instance *li)
Definition at line 786 of file pdclust.c.
◆ pdclust_ops
Initial value:= {
}
static void pdclust_fini(struct m0_ref *ref)
static int pdclust_instance_build(struct m0_layout *l, const struct m0_fid *fid, struct m0_layout_instance **out)
static m0_bcount_t pdclust_recsize(const struct m0_layout *l)
static int pdclust_encode(struct m0_layout *l, enum m0_layout_xcode_op op, struct m0_be_tx *tx, struct m0_bufvec_cursor *out)
static void pdclust_delete(struct m0_layout *l)
static int pdclust_decode(struct m0_layout *l, struct m0_bufvec_cursor *cur, enum m0_layout_xcode_op op, struct m0_be_tx *tx, uint32_t user_count)
Definition at line 208 of file pdclust.c.
◆ pdclust_type_ops
Initial value:= {
}
static m0_bcount_t pdclust_max_recsize(struct m0_layout_domain *dom)
static int pdclust_register(struct m0_layout_domain *dom, const struct m0_layout_type *lt)
static void pdclust_unregister(struct m0_layout_domain *dom, const struct m0_layout_type *lt)
static int pdclust_allocate(struct m0_layout_domain *dom, uint64_t lid, struct m0_layout **out)
Definition at line 961 of file pdclust.c.