|
enum | m0_avi_labels {
M0_AVI_CLIENT_SM_OP = M0_AVI_CLIENT_RANGE_START + 1,
M0_AVI_CLIENT_SM_OP_COUNTER,
M0_AVI_CLIENT_SM_OP_COUNTER_END = M0_AVI_CLIENT_SM_OP_COUNTER + 0x100,
M0_AVI_CLIENT_TO_DIX,
M0_AVI_CLIENT_COB_REQ,
M0_AVI_CLIENT_TO_COB_REQ,
M0_AVI_CLIENT_COB_REQ_TO_RPC,
M0_AVI_CLIENT_TO_IOO,
M0_AVI_IOO_TO_RPC,
M0_AVI_CLIENT_BULK_TO_RPC,
M0_AVI_OP_ATTR_ENTITY_ID,
M0_AVI_OP_ATTR_CODE,
M0_AVI_IOO_ATTR_BUFS_NR,
M0_AVI_IOO_ATTR_BUF_SIZE,
M0_AVI_IOO_ATTR_PAGE_SIZE,
M0_AVI_IOO_ATTR_BUFS_ALIGNED,
M0_AVI_IOO_ATTR_RMW,
M0_AVI_IOO_REQ,
M0_AVI_IOO_REQ_COUNTER,
M0_AVI_IOO_REQ_COUNTER_END = M0_AVI_IOO_REQ_COUNTER + 0x100
} |
|
enum | m0_entity_opcode {
M0_EO_INVALID,
M0_EO_CREATE,
M0_EO_DELETE,
M0_EO_SYNC,
M0_EO_OPEN,
M0_EO_GETATTR,
M0_EO_SETATTR,
M0_EO_LAYOUT_GET,
M0_EO_LAYOUT_SET,
M0_EO_NR
} |
|
enum | m0_obj_opcode {
M0_OC_READ = M0_EO_NR + 1,
M0_OC_WRITE,
M0_OC_ALLOC,
M0_OC_FREE,
M0_OC_NR
} |
|
enum | m0_idx_opcode {
M0_IC_GET = M0_OC_NR + 1,
M0_IC_PUT,
M0_IC_DEL,
M0_IC_NEXT,
M0_IC_LOOKUP,
M0_IC_LIST,
M0_IC_NR
} |
|
enum | m0_op_obj_flags { M0_OOF_NOHOLE = 1 << 0,
M0_OOF_SYNC = 1 << 1
} |
|
enum | m0_entity_type { M0_ET_REALM,
M0_ET_OBJ,
M0_ET_IDX
} |
|
enum | m0_entity_flags { M0_ENF_META = 1 << 0,
M0_ENF_NO_RMW = 1 << 1,
M0_ENF_DI = 1 << 2
} |
|
enum | m0_op_state {
M0_OS_UNINITIALISED,
M0_OS_INITIALISED,
M0_OS_LAUNCHED,
M0_OS_EXECUTED,
M0_OS_STABLE,
M0_OS_FAILED,
M0_OS_NR
} |
|
enum | m0_client_layout_type { M0_LT_PDCLUST = 0,
M0_LT_COMPOSITE,
M0_LT_CAPTURE,
M0_LT_NR
} |
|
enum | m0_realm_type { M0_ST_CONTAINER,
M0_ST_EPOCH,
M0_ST_DTX,
M0_ST_NR
} |
|
enum | {
LOCAL,
HA,
CONFD,
PROF,
HELP
} |
|
enum | {
CRT,
DRP,
LST,
LKP,
PUT,
DEL,
GET,
NXT,
GENF,
GENV
} |
|
enum | { INDEX_CMD_COUNT = 10,
MAX_VAL_SIZE = 500
} |
|
|
int | m0_obj_lock_init (struct m0_obj *obj) |
|
void | m0_obj_lock_fini (struct m0_obj *obj) |
|
int | m0_obj_write_lock_get (struct m0_obj *obj, struct m0_rm_lock_req *req, struct m0_clink *clink) |
|
int | m0_obj_write_lock_get_sync (struct m0_obj *obj, struct m0_rm_lock_req *req) |
|
int | m0_obj_read_lock_get (struct m0_obj *obj, struct m0_rm_lock_req *req, struct m0_clink *clink) |
|
int | m0_obj_read_lock_get_sync (struct m0_obj *obj, struct m0_rm_lock_req *req) |
|
void | m0_obj_lock_put (struct m0_rm_lock_req *req) |
|
void | m0_op_setup (struct m0_op *op, const struct m0_op_ops *cbs, m0_time_t linger) |
|
void | m0_op_launch (struct m0_op **op, uint32_t nr) |
|
int32_t | m0_op_wait (struct m0_op *op, uint64_t bits, m0_time_t to) |
|
void | m0_op_cancel (struct m0_op **op, uint32_t nr) |
|
void | m0_op_kick (struct m0_op *op) |
|
int32_t | m0_rc (const struct m0_op *op) |
|
void | m0_op_fini (struct m0_op *op) |
|
void | m0_op_free (struct m0_op *op) |
|
void | m0_container_init (struct m0_container *con, struct m0_realm *parent, const struct m0_uint128 *id, struct m0_client *instance) |
|
void | m0_epoch_init (struct m0_epoch *epoch, struct m0_realm *parent, const struct m0_uint128 *id) |
|
void | m0__dtx_init (struct m0__dtx *dtx, struct m0_realm *parent, const struct m0_uint128 *id) |
|
void | m0_obj_init (struct m0_obj *obj, struct m0_realm *parent, const struct m0_uint128 *id, uint64_t layout_id) |
|
void | m0_obj_fini (struct m0_obj *obj) |
|
void | m0_obj_idx_init (struct m0_idx *idx, const struct m0_obj *obj) |
|
int | m0_obj_op (struct m0_obj *obj, enum m0_obj_opcode opcode, struct m0_indexvec *ext, struct m0_bufvec *data, struct m0_bufvec *attr, uint64_t mask, uint32_t flags, struct m0_op **op) |
|
void | m0_idx_init (struct m0_idx *idx, struct m0_realm *parent, const struct m0_uint128 *id) |
|
void | m0_idx_fini (struct m0_idx *idx) |
|
int | m0_idx_op (struct m0_idx *idx, enum m0_idx_opcode opcode, struct m0_bufvec *keys, struct m0_bufvec *vals, int32_t *rcs, uint32_t flags, struct m0_op **op) |
|
void | m0_realm_create (struct m0_realm *realm, uint64_t wcount, uint64_t rcount, struct m0_op **op) |
|
void | m0_realm_open (struct m0_realm *realm, uint64_t wcount, uint64_t rcount, struct m0_op **op) |
|
void | m0_realm_close (struct m0_realm *realm, uint64_t wcount, uint64_t rcount, struct m0_op **op) |
|
int | m0_entity_open (struct m0_entity *entity, struct m0_op **op) |
|
void | m0_entity_fini (struct m0_entity *entity) |
|
size_t | m0_op_maxsize (void) |
|
int | m0_client_init (struct m0_client **m0c, struct m0_config *conf, bool init_m0) |
|
void | m0_client_fini (struct m0_client *m0c, bool fini_m0) |
|
void | m0_process_fid (const struct m0_client *m0c, struct m0_fid *proc_fid) |
|
int | m0_sync_op_init (struct m0_op **sop) |
|
int | m0_sync_entity_add (struct m0_op *sop, struct m0_entity *ent) |
|
int | m0_sync_op_add (struct m0_op *sop, struct m0_op *op) |
|
int | m0_entity_sync (struct m0_entity *ent) |
|
int | m0_sync (struct m0_client *m0c, bool wait) |
|
uint64_t | m0_obj_unit_size_to_layout_id (int unit_size) |
|
int | m0_obj_layout_id_to_unit_size (uint64_t layout_id) |
|
uint64_t | m0_client_layout_id (const struct m0_client *instance) |
|
enum m0_client_layout_type | m0_obj_layout_type (struct m0_obj *obj) |
|
int | m0_composite_layer_add (struct m0_client_layout *layout, struct m0_obj *sub_obj, int priority) |
|
void | m0_composite_layer_del (struct m0_client_layout *layout, struct m0_uint128 subobj_id) |
|
int | m0_composite_layer_idx (struct m0_uint128 layer_id, bool write, struct m0_idx *idx) |
|
int | m0_composite_layer_idx_key_to_buf (struct m0_composite_layer_idx_key *key, void **out_kbuf, m0_bcount_t *out_klen) |
|
void | m0_composite_layer_idx_key_from_buf (struct m0_composite_layer_idx_key *key, void *kbuf) |
|
int | m0_composite_layer_idx_val_to_buf (struct m0_composite_layer_idx_val *val, void **out_vbuf, m0_bcount_t *out_vlen) |
|
void | m0_composite_layer_idx_val_from_buf (struct m0_composite_layer_idx_val *val, void *vbuf) |
|
int | m0_client_layout_op (struct m0_obj *obj, enum m0_entity_opcode opcode, struct m0_client_layout *layout, struct m0_op **op) |
|
int | m0_client_layout_capture (struct m0_client_layout *layout, struct m0_obj *obj, struct m0_client_layout **out) |
|
struct m0_client_layout * | m0_client_layout_alloc (enum m0_client_layout_type type) |
|
void | m0_client_layout_free (struct m0_client_layout *layout) |
|
static int | subsystem_id (char *name) |
|
static void | usage (void) |
|
static int | opts_get (struct params *par, int *argc, char ***argv) |
|
int | main (int argc, char **argv) |
|
struct m0_client * | m0_instance () |
|
static struct m0_fid | ifid (uint64_t x, uint64_t y) |
|
static int | instance_init (struct params *params) |
|
static void | instance_fini (void) |
|
static int | genf (char *filename, int cnt) |
|
static int | genv (char *filename, int cnt, int size) |
|
static void | log_hex_val (const char *tag, void *buf, int size) |
|
static void | log_keys_vals (struct m0_bufvec *keys, struct m0_bufvec *vals) |
|
static void | log_fids (struct m0_fid_arr *fids, struct m0_bufvec *vals) |
|
static int | cmd_exec (struct index_cmd *cmd) |
|
static void | ctx_init (struct index_ctx *ctx) |
|
static void | ctx_fini (struct index_ctx *ctx) |
|
int | index_execute (int argc, char **argv) |
|
int | index_init (struct params *params) |
|
void | index_fini (void) |
|
void | index_usage (void) |
|
static int | per_item_rcs_analyse (int32_t *rcs, int cnt) |
|
static int | index_op_tail (struct m0_entity *ce, struct m0_op *op, int rc, int *sm_rc) |
|
int | index_create (struct m0_realm *parent, struct m0_fid_arr *fids) |
|
int | index_drop (struct m0_realm *parent, struct m0_fid_arr *fids) |
|
int | index_list (struct m0_realm *parent, struct m0_fid *fid, int cnt, struct m0_bufvec *keys) |
|
int | index_lookup (struct m0_realm *parent, struct m0_fid_arr *fids, struct m0_bufvec *rets) |
|
static int | index_op (struct m0_realm *parent, struct m0_fid *fid, enum m0_idx_opcode opcode, struct m0_bufvec *keys, struct m0_bufvec *vals) |
|
int | index_put (struct m0_realm *parent, struct m0_fid_arr *fids, struct m0_bufvec *keys, struct m0_bufvec *vals) |
|
int | index_del (struct m0_realm *parent, struct m0_fid_arr *fids, struct m0_bufvec *keys) |
|
int | index_get (struct m0_realm *parent, struct m0_fid *fid, struct m0_bufvec *keys, struct m0_bufvec *vals) |
|
int | index_next (struct m0_realm *parent, struct m0_fid *fid, struct m0_bufvec *keys, int cnt, struct m0_bufvec *vals) |
|
static int | command_id (const char *name) |
|
static int | file_lines_count (const char *filename) |
|
static int | fids_load (const char *val, struct m0_fid_arr *fids) |
|
static int | vals_xcode (const char *value, void *buf, m0_bcount_t *size) |
|
static int | item_load (FILE *f, char **item, int *size) |
|
static int | vals_load (const char *value, struct m0_bufvec *vals) |
|
static int | command_assign (struct index_cmd *cmd, int *argc, char ***argv) |
|
static bool | command_is_valid (struct index_cmd *cmd) |
|
int | index_parser_args_process (struct index_ctx *ctx, int argc, char **argv) |
|
void | index_parser_print_command_help (void) |
|
static void | rm_ctx_init (struct m0_rm_lock_ctx *ctx, struct m0_client *m0c, struct m0_fid *fid) |
|
static void | rm_ctx_fini (struct m0_ref *ref) |
|
static void | rm_lock_req_init (struct m0_clink *clink, struct m0_rm_owner *owner, struct m0_rm_lock_req *req, enum m0_rm_rwlock_req_type rw_type) |
|
static void | rm_lock_req_fini (struct m0_rm_lock_req *req) |
|
static void | obj_lock_incoming_complete (struct m0_rm_incoming *in, int32_t rc) |
|
static void | obj_lock_incoming_conflict (struct m0_rm_incoming *in) |
|
static bool | rm_key_eq (const void *key1, const void *key2) |
|
static uint64_t | rm_hash_func (const struct m0_htable *htable, const void *k) |
|
| M0_HT_DESCR_DEFINE (rm_ctx, "Hash-table for RM locks", M0_INTERNAL, struct m0_rm_lock_ctx, rmc_hlink, rmc_magic, M0_RM_MAGIC, M0_RM_HEAD_MAGIC, rmc_key, rm_hash_func, rm_key_eq) |
|
| M0_HT_DEFINE (rm_ctx, M0_INTERNAL, struct m0_rm_lock_ctx, struct m0_fid) |
|
M0_INTERNAL int | m0_obj_lock_get (struct m0_obj *obj, struct m0_rm_lock_req *req, struct m0_clink *clink, enum m0_rm_rwlock_req_type rw_type) |
|
M0_INTERNAL int | m0_obj_lock_get_sync (struct m0_obj *obj, struct m0_rm_lock_req *req, enum m0_rm_rwlock_req_type rw_type) |
|
Overview
Examples of Motr applications are:
- Motr file system client (m0t1fs);
- Lustre osd-motr module (part of LOMO);
- Lustre HSM backend (part of Castor-A200);
- SWIFT or S3 backend (part of WOMO);
- Motr-based block device (part of BOMO);
Client interface is divided into the following sub-interfaces:
- access sub-interface, which provides basic abstraction to build storage
application;
- management sub-interface, which provides methods for Motr cluster
deployment, configuration, re-configuration and monitoring;
- extension sub-interface, which allows applications to extend Motr
functionality without modifying the core Motr.
This header describes the access sub-interface of client, simply called "client interface" hereafter.
In the following "an application" means a code invoking the client interface and "the implementation" refers to the implementation of the said interface.
Client provides the following abstractions:
- object (m0_obj) is an array of fixed-size blocks;
- index (m0_idx) is a key-value store;
- realm (m0_realm) is a spatial and temporal part of system with a
prescribed access discipline. Objects, indices and operations live in
realms;
- operation (m0_op) is a process of querying or updating system
state;
Realms are further sub-divided in:
- transaction (m0__dtx) is a collection of operations atomic in the
face of failures;
- epoch (m0_epoch) is a collection of operations done by an
application, which moves the system from one application-consistent
state to another;
- container (m0_container) is a collection of objects used by a
particular application or group of applications;
- other types of realms, as can be added in the future.
Object, index and realm are sub-types of entity (m0_entity). Entities provide state, interface and behavior shared by all sub-types.
All client entry points, except for m0_op_wait(), are non-blocking. To perform a potentially lengthy activity that might involve network communication (for example, a read from an object), the client entry point (m0_obj_op() in the case of object read) sets up an operation (m0_op) structure containing the parameters of the activity and immediately returns to the caller. The caller should explicitly launch a set of previously prepared operations by calling m0_op_launch(). Separation of preparation and launch provides for more efficient network communication, where multiple operations are accumulated in the same network message.
In-memory structures (m0_{obj,index,realm,...}) correspond to some storage entities maintained by the implementation. The implementation does not enforce this correspondence. If an application keeps multiple in-memory structures for the same storage entity (whether in the same process address space or not), it is up to the application to keep the structures coherent (the very notion of coherency, obviously, being application-dependent). At the one end of the spectrum, an application can employ a fully coherent, distributed cache for entities, providing strong consistency guarantees. On the other end, an application can tolerate multiple inconsistent views of the same storage entities, providing NFSv2-like semantics.
Sub-typing
*
* entity (create, delete, open, close, fini) [abstract, no constructor]
* |
* |
* +---- object (init, read, write, alloc, free)
* |
* |
* +---- index (init, get, put, next)
* |
* |
* +---- realm () [abstract, no constructor]
* |
* |
* +---- container (init)
* |
* |
* +---- epoch (init)
* |
* |
* +---- dtx (init)
*
*
* op (init, wait, setup, launch, kick, free, fini)
* [has private sub-types in private.h]
*
*
Identifiers
An entity exists in some realm and has a 128-bit identifier, unique within the cluster and never re-used. The high 8 bits of an identifier denote the entity type. Identifier management is up to the application, except that the single identifier M0_UBER_REALM is reserved for the "uber realm", representing the root of the realm hierarchy, and within each entity type, identifiers less than M0_ID_APP are reserved for the implementation's internal use.
- Todo:
- A library on top of client for fast scalable identifier allocation will be provided as part of Motr.
The implementation is free to reserve some 8-bit combinations for its internal use.
- Todo:
- an interface to register 8-bit combinations for application use (to introduce application-specific "entity-like" things).
Operations
An operation structure tracks the state of execution of a request made to the implementation.
An operation structure is a state machine going through states described by enum m0_op_state:
* (0)
* |
* |
* V
* +---------------INITIALISED
* | |
* | | m0_op_launch()
* V V
* FAILED<-------------LAUNCHED
* ^ |
* | |
* | V
* +----------------EXECUTED---------------->STABLE
*
An operation in INITIALISED, FAILED or STABLE state is called "complete" and "outstanding" (or "in-progress") otherwise.
An operation is in INITIALISED state after allocation. In this state, the operation processing is not yet started, the application is free to modify operation parameters with a call to m0_op_setup() or direct field access.
Multiple initialised operation structures can be simultaneously moved to the LAUNCHED state, by a call to m0_op_launch(). This call starts actual operation processing. No changes to the operation structure are allowed by the application after this call is made and until the operation completes. To improve caching and utilisation of system resources, the implementation is free to delay any operation-related activities, such as sending network messages, for some time after the operation is launched. The value of m0_op::op_linger is an application-provided hint about the absolute time by which such delays should be limited.
In case of successful execution, a launched operation structure eventually reaches EXECUTED state, meaning that the operation was executed at least in the volatile stores of the respective services. When the operation enters EXECUTED state, the m0_op_ops::oop_executed() call-back, if provided by the application, is invoked. By the time this state is entered, the operation return code is in m0_op::op_sm::sm_rc, and all return information (object data in case of READ, keys and values in case of GET and NEXT) is already placed in the application-supplied buffers.
After an operation has been executed, it can still be lost due to a failure. The implementation continues to work toward making the operation stable. When this activity successfully terminates, the operation enters the STABLE state and the m0_op_ops::oop_stable() call-back is invoked, if provided. Once an operation is stable, the implementation guarantees that the operation would survive any "allowed failure", where allowed failures include at least transient service failures (crash and restart with volatile store loss), transient network failures and client failures.
In case of a failure, the operation structure moves into FAILED state, the m0_op_ops::oop_failed() call-back is invoked, and no further state transitions will ensue.
The implementation is free to add other states to the operation state machine.
All operation structures embed "generic operation" m0_op as the first member.
The application can track the state of the operation either synchronously, by waiting until the operation reaches a particular state (m0_op_wait()), or asynchronously by supplying (m0_op_setup()) a call-back to be called when the operation reaches a particular state.
Operation structures are either pre-allocated by the application or allocated by the appropriate entry points, see the "op" parameter of m0_obj_op() for an example. When an operation structure is pre-allocated, the application must set m0_op::op_size to the size of the pre-allocated structure before passing it to a client entry point. This allows the implementation to check that the pre-allocated structure has enough room and return an error (-EMSGSIZE) otherwise.
Operation errors are returned through m0_op::op_sm::sm_rc.
Operations, common for all entity types are implemented at the entity level: m0_entity_create(), m0_entity_delete(), m0_entity_fini().
A typical usage would involve initialisation of a concrete entity (e.g., object), execution of some generic operations and then of some concrete operations, for example:
Object
A client object is an array of blocks, which can be read from and written onto at the block granularity.
Block size is a power of two bytes and is selected at the object creation time.
An object has no traditional application-visible meta-data (in particular, it has no size). Instead it has meta-data, called "block attributes" associated with each block. Block attributes can be used to store check-sums, version numbers, hashes, etc. Because of the huge number of blocks in a typical system, the overhead of block attributes book-keeping must be kept at a minimum, which puts restrictions on the block attribute access interface (todo: to be described).
There are 4 types of object operations, in addition to the common entity operations:
- READ: transfer blocks and block attributes from an object to
application buffers;
- WRITE: transfer blocks and block attributes from application buffers to
an object;
- ALLOC: pre-allocate certain blocks in an implementation-dependent
manner. This operation guarantees that consecutive WRITE onto
pre-allocated blocks will not fail due to lack of storage space;
- FREE: free storage resources used by specified object
blocks. Consecutive reads from the blocks will return zeroes.
READ and WRITE operations are fully scatter-gather-scatter: data are transferred between a sequence of object extents and a sequence of application buffers, the only restrictions being:
- total lengths of the extents must be equal to the total size of the
buffers, and
- extents must be block-size aligned and sizes of extents and buffers
must be multiples of block-size.
Internally, the implementation stores an object according to the object layout (specified at the object creation time). The layout determines fault-tolerance and performance related characteristics of the object. Example layouts are:
- network striping with parity de-clustering. This is the default layout,
it provides a flexible level of fault-tolerance, high availability in
the face of permanent storage device failures and full utilisation of
storage resources;
- network striping without parity (raid0). This provides higher space
utilisation and lower processor costs than parity de-clustering at the
expense of fault-tolerance;
- network mirroring (raid1). This provides high fault-tolerance and
availability at the cost of storage space consumption;
- de-dup, compression, encryption.
Index
A client index is a key-value store.
An index stores records, each record consisting of a key and a value. Keys and values within the same index can be of variable size. Keys are ordered by the lexicographic ordering of their bit-level representation. Records are ordered by the key ordering. Keys are unique within an index.
There are 4 types of index operations:
- GET: given a set of keys, return the matching records from the index;
- PUT: given a set of records, place them in the index, overwriting
existing records if necessary, inserting new records otherwise;
- DEL: given a set of keys, delete the matching records from the index;
- NEXT: given a set of keys, return the records with the next (in the
ascending key order) keys from the index.
Indices are stored according to a layout, much like objects.
Realm
To define what a realm is, consider the entire history of a client storage system. In the history, each object and index is represented as a "world line" (https://en.wikipedia.org/wiki/World_line), which starts when the entity is created and ends when it is deleted. Points on the world line correspond to operations that update entity state.
A realm is the union of some continuous portions of different world lines. That is, a realm is given by a collection of entities and, for each entity in the collection, a start and an end point (operations) on the entity world line. A realm can be visualised as a cylinder in the history.
The restriction placed on realms is that each start point in a realm must either be the first point in a world line (i.e., the corresponding entity is created in the realm) or belong to the same realm, called the parent of the realm in question. This arranges realms in a tree.
- Note
- Realms are not necessarily disjoint.
A realm can be in the following application-controllable states:
- OPEN: in this state the realm can be extended by executing new
operations against entities already in the realm or creating new
entities in the realm;
- CLOSED: in this state the realm can no longer be extended, but it is
tracked by the system and maintains its identity. Entities in a closed
realm can be located and read-only operations can be executed on them;
- ABSORBED: in this state the realm is no longer tracked by the
system. All the operations executed as part of the realm are by now
stable and absorbed in the parent realm;
- FAILED: an application aborted the realm or the implementation
unilaterally decided to abort it. The operations executed in the realm
are undone together with a transitive closure of dependent operations
(the precise definition being left to the implementation
discretion). Thus, failure of a realm can lead to cascading failures of
other realms.
Examples of realms are:
- a container (m0_container) can be thought of as a "place" where
a particular storage application lives. In a typical scenario, when an
application is setup on a system, a new container, initially empty,
will be created for the application. The application can create new
entities in the container and manipulate them without risk of conflicts
(e.g., for identifier allocation) with other applications. A container
can be thought of as a virtualised storage system for an application. A
container realm is open as long as application needs its persistent
data. When the application is uninstalled, its realm is deleted;
- a snapshot realm is created with a container as the parent and is
immediately closed. From now on, the snapshot provides a read-only view
of container objects at the moment of the snapshot creation. Finally,
the snapshot is deleted. If a snapshot is not closed immediately, but
remains open, it is a writeable snapshot (clone)---a separate branch in
the container's history. A clone is eventually deleted without being
absorbed in the parent container;
- an epoch (m0_epoch) is a realm capturing part of the
application's work-flow for resilience. Often an HPC application works
by interspersing "compute phases", when actual data processing is done,
with an "IO phase" when a checkpoint of application state is saved on
the storage system for failure recovery purposes. A client application
would, instead, keep an open "current" epoch realm, closed at the
compute-IO phase transition, with the new epoch opened immediately. The
realm tree for such application would look like
*
* CONTAINER--->E--->E---...->E--->CURRENT
*
*
Where all E epochs are closed and in the process of absorption, and all
earlier epochs already absorbed in the container.
If the application fails, it can restart either from the container or
from any closed epoch, which are all guaranteed to be consistent, that
is, reflect storage state at the boundary of a compute phase. The final
CURRENT epoch is potentially inconsistent after a failure and should be
deleted.
- a distributed transaction (m0__dtx) is a group of operations,
which must be atomic w.r.t. failures.
Ownership
Client entity structures (realms, objects and indices) are allocated by the application. The application may free a structure after completing the corresponding finalisation call. The application must ensure that all outstanding operations on the entity are complete before finalisation.
An operation structure allocated by the application must remain allocated until the operation is complete. Before a complete operation structure can be re-used, it should be finalised by a call to m0_op_fini(). An operation structure allocated by the client implementation can be finalised and re-used as one allocated by the application, and must be eventually freed by the application (by calling m0_op_free()) some time after the operation completes.
Data blocks used by scatter-gather-scatter lists and key-value records are allocated by the application. For read-only operations (M0_OC_READ, M0_IC_GET and M0_IC_NEXT) the application may free the data blocks as soon as the operation reaches EXECUTED or FAILED state. For updating operations, the data blocks must remain allocated until the operation stabilises.
Concurrency
The client implementation guarantees that concurrent calls to the same index are linearizable.
All other concurrency control, including ordering of reads and writes to a client object, and distributed transaction serializability, is up to the application.
For documentation links, please refer to this file: doc/motr-design-doc-list.rst
- Todo:
- entity type structures (to provide constructors, 8-bit identifier tags and an ability to register new entity types).
- Todo:
- handling of extensible attributes (check-sums, version numbers, etc.), which require interaction with the implementation on the service side.
◆ M0_COMPOSITE_EXTENT_INF
#define M0_COMPOSITE_EXTENT_INF (0xffffffffffffffff) |
◆ M0_TRACE_SUBSYSTEM [1/5]
#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_CLIENT |
◆ M0_TRACE_SUBSYSTEM [2/5]
#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_CLIENT |
◆ M0_TRACE_SUBSYSTEM [3/5]
#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_CLIENT |
◆ M0_TRACE_SUBSYSTEM [4/5]
#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_CLIENT |
◆ M0_TRACE_SUBSYSTEM [5/5]
#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_CLIENT |
◆ anonymous enum
Enumerator |
---|
LOCAL | |
HA | |
CONFD | |
PROF | |
HELP | |
Definition at line 55 of file cmd_main.c.
◆ anonymous enum
Enumerator |
---|
CRT | |
DRP | |
LST | |
LKP | |
PUT | |
DEL | |
GET | |
NXT | |
GENF | |
GENV | |
Definition at line 36 of file index.h.
◆ anonymous enum
Enumerator |
---|
INDEX_CMD_COUNT | |
MAX_VAL_SIZE | |
Definition at line 49 of file index.h.
◆ m0_avi_labels
Enumerator |
---|
M0_AVI_CLIENT_SM_OP | |
M0_AVI_CLIENT_SM_OP_COUNTER | |
M0_AVI_CLIENT_SM_OP_COUNTER_END | |
M0_AVI_CLIENT_TO_DIX | |
M0_AVI_CLIENT_COB_REQ | |
M0_AVI_CLIENT_TO_COB_REQ | |
M0_AVI_CLIENT_COB_REQ_TO_RPC | |
M0_AVI_CLIENT_TO_IOO | |
M0_AVI_IOO_TO_RPC | |
M0_AVI_CLIENT_BULK_TO_RPC | |
M0_AVI_OP_ATTR_ENTITY_ID | |
M0_AVI_OP_ATTR_CODE | |
M0_AVI_IOO_ATTR_BUFS_NR | |
M0_AVI_IOO_ATTR_BUF_SIZE | |
M0_AVI_IOO_ATTR_PAGE_SIZE | |
M0_AVI_IOO_ATTR_BUFS_ALIGNED | |
M0_AVI_IOO_ATTR_RMW | |
M0_AVI_IOO_REQ | |
M0_AVI_IOO_REQ_COUNTER | |
M0_AVI_IOO_REQ_COUNTER_END | |
Definition at line 38 of file addb.h.
◆ m0_client_layout_type
The layout of an entity contains information to locate data (node, service, device). TODO: rewrite the definition.
Enumerator |
---|
M0_LT_PDCLUST | |
M0_LT_COMPOSITE | |
M0_LT_CAPTURE | |
M0_LT_NR | |
Definition at line 776 of file client.h.
◆ m0_entity_flags
Flags passed to m0_entity_create(), m0_entity_open() to specify application's behaviour.
Enumerator |
---|
M0_ENF_META | If motr client application has the capability to store object metadata by itself (such as pool version and layout, which can be stored by the application at motr distributed index, for example), it can use this flag to avoid sending additional metadata fops on such object operations as CREATE, OPEN, DELETE, GETATTR and, thus, improve its performance.
Before calling m0_entity_create() or m0_entity_open(), the application is expected to set obj->ob_entity.en_flags |= M0_ENF_META. When m0_entity_create() returns, the pool version and layout id will be available for the application at obj->ob_attr.oa_pver and obj->ob_attr.oa_lid respectively.
For example, create workflow can look like this:
obj->ob_entity.en_flags |= M0_ENF_META;
m0_entity_create(NULL, &obj->ob_entity, &ops[0]);
// Save the returned pool version and lid into app_meta_data
app_meta_data.pver = obj->ob_attr.oa_pver;
app_meta_data.lid = obj->ob_attr.oa_lid;
And read workflow:
obj->ob_entity.en_flags |= M0_ENF_META;
// Set the pool version and lid from app_meta_data
obj->ob_attr.oa_pver = app_meta_data.pver;
obj->ob_attr.oa_lid = app_meta_data.lid;
m0_entity_open(NULL, &obj->ob_entity, &ops[0]);
|
M0_ENF_NO_RMW | If this flag is set during entity_create(), it means the application does not support update operations. This flag is not in use yet.
|
M0_ENF_DI | This flag is to enable data integrity.
|
Definition at line 595 of file client.h.
◆ m0_entity_opcode
Operation codes for entity, object and index.
Enumerator |
---|
M0_EO_INVALID | |
M0_EO_CREATE | |
M0_EO_DELETE | |
M0_EO_SYNC | |
M0_EO_OPEN | |
M0_EO_GETATTR | |
M0_EO_SETATTR | |
M0_EO_LAYOUT_GET | |
M0_EO_LAYOUT_SET | |
M0_EO_NR | |
Definition at line 523 of file client.h.
◆ m0_entity_type
Types of entities supported by client.
Enumerator |
---|
M0_ET_REALM | |
M0_ET_OBJ | |
M0_ET_IDX | |
Definition at line 585 of file client.h.
◆ m0_idx_opcode
Enumerator |
---|
M0_IC_GET | Lookup a value with the given key.
|
M0_IC_PUT | Insert or update the value, given a key.
|
M0_IC_DEL | Delete the value, if any, for the given key.
|
M0_IC_NEXT | Given a key, return the next key and its value.
|
M0_IC_LOOKUP | Check whether an index exists.
|
M0_IC_LIST | Given an index id, get the list of next indices.
|
M0_IC_NR | |
Definition at line 550 of file client.h.
◆ m0_obj_opcode
Object operation codes.
Enumerator |
---|
M0_OC_READ | Read object data.
|
M0_OC_WRITE | Write object data.
|
M0_OC_ALLOC | Pre-allocate space.
|
M0_OC_FREE | De-allocate space, consecutive reads will return 0s.
|
M0_OC_NR | |
Definition at line 537 of file client.h.
◆ m0_op_obj_flags
Flags passed to m0_obj_op() to specify object IO operation behaviour.
Enumerator |
---|
M0_OOF_NOHOLE | Read operation should not see any holes. If a hole is met during read, return error instead.
|
M0_OOF_SYNC | Write, alloc and free operations wait for the transaction to become persistent before returning.
|
Definition at line 569 of file client.h.
◆ m0_op_state
Operation state, stored in m0_op::op_sm::sm_state.
Enumerator |
---|
M0_OS_UNINITIALISED | |
M0_OS_INITIALISED | |
M0_OS_LAUNCHED | |
M0_OS_EXECUTED | |
M0_OS_STABLE | |
M0_OS_FAILED | |
M0_OS_NR | |
Definition at line 691 of file client.h.
◆ m0_realm_type
Enumerator |
---|
M0_ST_CONTAINER | |
M0_ST_EPOCH | |
M0_ST_DTX | |
M0_ST_NR | |
Definition at line 850 of file client.h.
◆ cmd_exec()
static int cmd_exec |
( |
struct index_cmd * |
cmd | ) |
|
|
static |
◆ command_assign()
static int command_assign |
( |
struct index_cmd * |
cmd, |
|
|
int * |
argc, |
|
|
char *** |
argv |
|
) |
| |
|
static |
◆ command_id()
static int command_id |
( |
const char * |
name | ) |
|
|
static |
◆ command_is_valid()
static bool command_is_valid |
( |
struct index_cmd * |
cmd | ) |
|
|
static |
◆ ctx_fini()
static void ctx_fini |
( |
struct index_ctx * |
ctx | ) |
|
|
static |
◆ ctx_init()
static void ctx_init |
( |
struct index_ctx * |
ctx | ) |
|
|
static |
◆ fids_load()
static int fids_load |
( |
const char * |
val, |
|
|
struct m0_fid_arr * |
fids |
|
) |
| |
|
static |
◆ file_lines_count()
static int file_lines_count |
( |
const char * |
filename | ) |
|
|
static |
◆ genf()
static int genf |
( |
char * |
filename, |
|
|
int |
cnt |
|
) |
| |
|
static |
◆ genv()
static int genv |
( |
char * |
filename, |
|
|
int |
cnt, |
|
|
int |
size |
|
) |
| |
|
static |
◆ ifid()
static struct m0_fid ifid |
( |
uint64_t |
x, |
|
|
uint64_t |
y |
|
) |
| |
|
static |
◆ index_create()
◆ index_del()
◆ index_drop()
◆ index_execute()
int index_execute |
( |
int |
argc, |
|
|
char ** |
argv |
|
) |
| |
◆ index_fini()
◆ index_get()
◆ index_init()
int index_init |
( |
struct params * |
params | ) |
|
◆ index_list()
◆ index_lookup()
◆ index_next()
◆ index_op()
◆ index_op_tail()
static int index_op_tail |
( |
struct m0_entity * |
ce, |
|
|
struct m0_op * |
op, |
|
|
int |
rc, |
|
|
int * |
sm_rc |
|
) |
| |
|
static |
◆ index_parser_args_process()
int index_parser_args_process |
( |
struct index_ctx * |
ctx, |
|
|
int |
argc, |
|
|
char ** |
argv |
|
) |
| |
◆ index_parser_print_command_help()
void index_parser_print_command_help |
( |
void |
| ) |
|
◆ index_put()
◆ index_usage()
void index_usage |
( |
void |
| ) |
|
◆ instance_fini()
static void instance_fini |
( |
void |
| ) |
|
|
static |
◆ instance_init()
static int instance_init |
( |
struct params * |
params | ) |
|
|
static |
◆ item_load()
static int item_load |
( |
FILE * |
f, |
|
|
char ** |
item, |
|
|
int * |
size |
|
) |
| |
|
static |
◆ log_fids()
◆ log_hex_val()
static void log_hex_val |
( |
const char * |
tag, |
|
|
void * |
buf, |
|
|
int |
size |
|
) |
| |
|
static |
◆ log_keys_vals()
◆ m0__dtx_init()
◆ m0_client_fini()
void m0_client_fini |
( |
struct m0_client * |
m0c, |
|
|
bool |
fini_m0 |
|
) |
| |
Finalises client, finalise state machine group et al.
- Precondition
- (m0c != NULL).
Definition at line 1711 of file client_init.c.
◆ m0_client_init()
Initialises state machine types et al.
- Parameters
-
m0c | Where to store the allocated instance. |
conf | client configuration parameters. |
init_m0 | Indicate whether or not Motr needs to be initialised. |
- Returns
- 0 for success, anything else for an error.
- Precondition
- m0c must point to a NULL struct m0_client *.
-
local_ep must not be NULL or the empty string.
Definition at line 1533 of file client_init.c.
◆ m0_client_layout_alloc()
◆ m0_client_layout_capture()
Note: the current version only supports capturing the pdclust layout of an object.
To capture the layout for an object, an application has to issue LAYOUT_GET op first to retrieve the object's layout.
Routines for capturing a layout.
Definition at line 138 of file layout.c.
◆ m0_client_layout_free()
◆ m0_client_layout_id()
uint64_t m0_client_layout_id |
( |
const struct m0_client * |
instance | ) |
|
◆ m0_client_layout_op()
Initialises layout operation.
- Parameters
-
| obj | The object to which the layout belongs. |
| layout | Layout the operation is targeted to. |
| opcode | Operation code for the operation. |
[out] | op | Pointer to the operation pointer. If the operation pointer is NULL, motr will allocate one. Otherwise, motr will check the operation and make sure it is reusable for this operation. |
Definition at line 436 of file layout.c.
◆ m0_composite_layer_add()
Add layer to the composite layout.
- Parameters
-
layout | The layout to add to. |
sub_obj | The sub object corresponds to the new layer. The API requires object argument instead of its identifier as Motr internally requires some object attributes to construct the composite layout. |
priority | The layer's priority which is used to select which layer an IO request goes to. |
- Returns
- 0 for success, anything else for an error.
Definition at line 535 of file composite_layout.c.
◆ m0_composite_layer_del()
Delete layer from the composite layout.
- Parameters
-
layout | The layout to delete from. |
subobj_id | The id of the sub-object (layer) to delete. |
Definition at line 589 of file composite_layout.c.
◆ m0_composite_layer_idx()
int m0_composite_layer_idx |
( |
struct m0_uint128 |
layer_id, |
|
|
bool |
write, |
|
|
struct m0_idx * |
idx |
|
) |
| |
Returns an in-memory index representation for extents in a composite layer.
- Parameters
-
layer_id | The composite layer in question. |
write | True for extents for WRITE, false for extents for READ. |
idx | The returned index. |
- Returns
- 0 for success, anything else for an error.
Definition at line 1312 of file composite_layout.c.
◆ m0_composite_layer_idx_key_from_buf()
◆ m0_composite_layer_idx_key_to_buf()
◆ m0_composite_layer_idx_val_from_buf()
◆ m0_composite_layer_idx_val_to_buf()
◆ m0_container_init()
◆ m0_entity_create()
Sets an operation to create or delete an entity.
- Parameters
-
| pool | Specify the pool to store the entity if it is not NULL, otherwise a pool selected by internal policy is used. |
| entity | In-memory representation of the entity that is to be created. |
[out] | op | Pointer to the operation. The operation can be pre-allocated by the application. Otherwise, this entry point will allocate it if it succeeds. |
- Returns
- 0 for success, (*op)->op_sm.sm_rc otherwise.
- Precondition
- entity != NULL
-
op != NULL
Definition at line 801 of file obj.c.
◆ m0_entity_delete()
int m0_entity_delete |
( |
struct m0_entity * |
entity, |
|
|
struct m0_op ** |
op |
|
) |
| |
◆ m0_entity_fini()
void m0_entity_fini |
( |
struct m0_entity * |
entity | ) |
|
Finalises an entity, freeing any additional memory allocated to represent it.
- Parameters
-
entity | Pointer to the entity to finalise. |
- Precondition
- entity != NULL
-
entity->en_sm.sm_state == M0_ES_INIT
Definition at line 438 of file client.c.
◆ m0_entity_open()
Sets an operation to open an entity.
- Parameters
-
| entity | The entity that needs to be opened. |
[out] | op | Pointer to the operation, which can be pre-allocated by the application. Otherwise, this entry point will allocate it if it succeeds. |
- Returns
- 0 for success, (*op)->op_sm.sm_rc otherwise
- Precondition
- entity != NULL
-
op != NULL
Definition at line 885 of file obj.c.
◆ m0_entity_sync()
Blocking version of entity sync API, corresponding to m0t1fs_fsync() in m0t1fs.
- Parameters
-
ent | The object that is going to be sync'ed. |
- Returns
- 0 for success, anything else for an error.
Entry point for sync, calls sync_core with mode=active
Definition at line 1061 of file sync.c.
◆ m0_epoch_init()
◆ M0_HT_DEFINE()
◆ M0_HT_DESCR_DEFINE()
M0_HT_DESCR_DEFINE |
( |
rm_ctx |
, |
|
|
"Hash-table for RM locks" |
, |
|
|
M0_INTERNAL |
, |
|
|
struct m0_rm_lock_ctx |
, |
|
|
rmc_hlink |
, |
|
|
rmc_magic |
, |
|
|
M0_RM_MAGIC |
, |
|
|
M0_RM_HEAD_MAGIC |
, |
|
|
rmc_key |
, |
|
|
rm_hash_func |
, |
|
|
rm_key_eq |
|
|
) |
| |
◆ m0_idx_fini()
void m0_idx_fini |
( |
struct m0_idx * |
idx | ) |
|
◆ m0_idx_init()
Initialises client index in a given realm.
Notes for M0_IDX_DIX index service type: 'id' should be a valid motr fid of type 'x' (see m0_dix_fid_type). Zero fid container is reserved for distributed meta-indices and shouldn't be used for user indices, i.e. indices with M0_FID_TINIT('x', 0, *) fids are reserved.
Non-distributed indices (having fid type 'i') are going to be supported in future.
Definition at line 626 of file idx.c.
◆ m0_idx_op()
Initialises an index operation.
For M0_IC_NEXT operation arguments should be as follows:
- 'keys' buffer vector first element should contain a starting key and other elements should be set to NULL. Buffer vector size indicates number of records to return. Starting key can be NULL. In this case starting key is treated as the smallest possible key of the index. If starting key doesn't exist in the index, then retrieved records will start with the smallest key following the starting key. Otherwise, a record corresponding to the starting key will be included in a result.
- 'vals' vector should be at least of the same size as 'keys' and should contain NULLs. After successful operation completion retrieved index records are stored in 'keys' and 'vals' buffer vectors. If some error occurred during i-th index record retrieval then rcs[i] != 0. -ENOENT error means that there are no more records to return.
For M0_IC_GET operation arguments should be as follows:
- 'keys' buffer vector should contain keys for records being requested. At least one key should be specified and no NULL keys are allowed.
- 'vals' vector should be at least of the same size as 'keys' and should contain NULLs. After successful operation completion retrieved record values are stored in 'vals' buffer vector. If some value retrieval has failed, then corresponding element in 'rcs' array != 0.
'rcs' holds array of per-item return codes for the operation. It should be allocated by user with a size of at least 'keys->ov_vec.v_nr' elements. For example, 6 records with keys k0...k5 were requested through GET request with k3 being absent in the index. After operation completion rcs[3] will be -ENOENT and rcs[0,1,2,4,5] will be 0.
Per-item return codes are more fine-grained than global operation return code (op->op_sm.sm_rc). On operation completion the global return code is set to negative value if it's impossible to process any item (invalid index fid, lost RPC connection, etc.).
- If the operation global return code is 0, then user should check per-item return codes.
- If the operation global return code is not 0, then per-item return codes are undefined.
'rcs' argument is mandatory for all operations except M0_IC_LOOKUP.
For M0_IC_PUT the 'flags' argument may be set.
- 'flags' is a bit-mask of m0_op_idx_flags enum. M0_OIF_OVERWRITE and M0_OIF_SYNC_WAIT are supported for now. If M0_OIF_OVERWRITE flag is set then records with existing keys are overwritten, otherwise operation returns -EEXIST for existing keys. If M0_OIF_SYNC_WAIT flag is set then it ensures that reply would be sent only when transaction is persisted. This flag can only be used with M0_IC_PUT or M0_IC_DEL.
- Precondition
- idx != NULL
-
M0_IN(opcode, (M0_IC_LOOKUP, M0_IC_LIST, M0_IC_GET, M0_IC_PUT, M0_IC_DEL, M0_IC_NEXT))
-
ergo(*op != NULL, *op->op_size >= sizeof **op)
-
ergo(opcode == M0_IC_LOOKUP, rcs != NULL)
-
ergo(opcode != M0_IC_LOOKUP, keys != NULL)
-
M0_IN(opcode, (M0_IC_DEL, M0_IC_LOOKUP, M0_IC_LIST)) == (vals == NULL)
-
ergo(opcode == M0_IC_LIST, m0_forall(i, keys->ov_vec.v_nr, keys->ov_vec.v_count[i] == sizeof(struct m0_uint128)))
-
ergo(opcode == M0_IC_GET, keys->ov_vec.v_nr != 0)
-
ergo(opcode == M0_IC_GET, m0_forall(i, keys->ov_vec.v_nr, keys->ov_buf[i] != NULL))
-
ergo(flags == M0_OIF_SYNC_WAIT, M0_IN(opcode, (M0_IC_PUT, M0_IC_DEL)))
-
ergo(vals != NULL, keys->ov_vec.v_nr == vals->ov_vec.v_nr)
- Postcondition
- ergo(result == 0, *op != NULL && *op->op_code == opcode && *op->op_sm.sm_state == M0_OS_INITIALISED)
- Todo:
- For now 'rcs' may be NULL if index backend is not Motr KVS and operation code is not M0_IC_GET. All backends should be updated to fill 'rcs' for all operation codes.
Definition at line 554 of file idx.c.
◆ m0_instance()
◆ m0_obj_fini()
void m0_obj_fini |
( |
struct m0_obj * |
obj | ) |
|
Finalises an obj, which finalises its entity and frees any additional memory allocated to represent it.
- Parameters
-
obj | Pointer to the object to finalise. |
- Precondition
- obj != NULL
Definition at line 467 of file client.c.
◆ m0_obj_idx_init()
void m0_obj_idx_init |
( |
struct m0_idx * |
idx, |
|
|
const struct m0_obj * |
obj |
|
) |
| |
Initialises the index corresponding to a given object.
Keys in this index are 64-bit block offsets (in BE representation, with lexicographic ordering) and the values are battrs (and maybe data?) for the block.
The index structure, initialised by this function, provides access to object data through client index interface.
- Postcondition
- m0_uint128_eq(&idx->in_entity.en_id, &obj->ob_entity.en_id)
◆ m0_obj_init()
Initialises a client object so that it can be created or deleted, or have read, write, alloc and free operations executed on it.
The size of data and parity buffer (m0_obj::ob_attr::oa_bshift) is set to default value 'M0_DEFAULT_BUF_SHIFT'.
If layout_id == 0, then this object will be set with optimal layout id according to the object size set in m0_obj::ob_attr::oa_buf_size. If Object size is not set, then this object will be set with default layout id (See struct m0_obj_attr).
- Parameters
-
obj | The object to initialise. |
parent | The realm operations on this object will be part of. |
id | The identifier assigned by the application to this object. |
layout_id | The layout id assigned by the application to this object. |
- Precondition
- obj != NULL
-
parent != NULL
-
id != NULL && m0_uint128_cmp(&M0_ID_APP, id) < 0
Definition at line 403 of file client.c.
◆ m0_obj_layout_id_to_unit_size()
int m0_obj_layout_id_to_unit_size |
( |
uint64_t |
layout_id | ) |
|
Maps a layout id to unit size.
- Parameters
-
layout_id | The layout id to query. The layout_id must be one of the valid layout ids defined in Motr. |
- Returns
- The corresponding unit size of queried layout id.
Definition at line 851 of file obj.c.
◆ m0_obj_layout_type()
Gets the layout type of an object.
- Parameters
-
- Returns
- The layout type of object in question.
Definition at line 879 of file obj.c.
◆ m0_obj_lock_fini()
void m0_obj_lock_fini |
( |
struct m0_obj * |
obj | ) |
|
Finalizes the object lock and decreases the rm_ctx::rmc_ref::ref_cnt. If the rm_ctx::rmc_ref::ref_cnt becomes 0, then finalizes the cached RM context.
- Precondition
- m0_obj_init()
-
m0_obj_lock_init()
- Parameters
-
obj | an instance of object. |
Definition at line 144 of file obj_lock.c.
◆ m0_obj_lock_get()
Acquires the RM lock for the object asynchronously.
This function requests RM creditor (remote or local) to acquire the rights to use a resource, attaches a clink to the lock_req channel and returns. The clink will be signalled when the resource has been granted, hence the application should wait on the clink before executing any code which absolutely requires the object to be locked.
Definition at line 191 of file obj_lock.c.
◆ m0_obj_lock_get_sync()
Acquires the RM lock for the object. This is a blocking function.
Definition at line 214 of file obj_lock.c.
◆ m0_obj_lock_init()
int m0_obj_lock_init |
( |
struct m0_obj * |
obj | ) |
|
Initializes the RM object lock for this object and attaches a RM context with this object.
- Precondition
- m0_obj_init()
- Parameters
-
obj | an instance of object. |
group | group_id for the object. |
- Return values
-
0 | On success. |
-ENOMEM | when there is no memory for ctx or fid. |
Definition at line 82 of file obj_lock.c.
◆ m0_obj_lock_put()
Releases the RM lock for the object.
- Parameters
-
req | Representing a request to RM. |
Definition at line 267 of file obj_lock.c.
◆ m0_obj_op()
Initialises object operation.
- Parameters
-
| obj | Object the operation is targeted to. |
| opcode | Operation code for the operation. |
| ext | Extents in the object, measured in blocks. |
| data | Application buffers for the operation. |
| attr | Application buffers for block attributes. |
| mask | Attribute mask. |
[in,out] | op | Pointer to the operation pointer. If the operation pointer is NULL, client will allocate one. Otherwise, client will check the operation and make sure it is reusable for this operation. |
- Returns
- 0 for successful initialisation of operation. Otherwise standard linux system error code.
- Precondition
- obj != NULL
-
M0_IN(opcode, (M0_OC_READ, M0_OC_WRITE, M0_OC_ALLOC, M0_OC_FREE))
-
ext != NULL
-
obj->ob_attr.oa_bshift >= M0_MIN_BUF_SHIFT
-
m0_vec_count(&ext->iv_vec) % (1ULL << obj->ob_attr.oa_bshift) == 0
-
op != NULL
-
ergo(M0_IN(opcode, (M0_OC_READ, M0_OC_WRITE)), data != NULL && attr != NULL && m0_vec_count(&ext->iv_vec) == m0_vec_count(&data->ov_vec) && m0_vec_count(&attr->ov_vec) == 8 * m0_no_of_bits_set(mask) * (m0_vec_count(&ext->iv_vec) >> obj->ob_attr.oa_bshift)
-
ergo(M0_IN(opcode, (M0_OC_ALLOC, M0_OC_FREE)), data == NULL && attr == NULL && mask == 0)
-
ergo(opcode == M0_OC_READ, M0_IN(flags, (0, M0_OOF_NOHOLE)))
-
ergo(opcode != M0_OC_READ, M0_IN(flags, (0, M0_OOF_SYNC)))
- Postcondition
- ergo(*op != NULL, *op->op_code == opcode && *op->op_sm.sm_state == M0_OS_INITIALISED)
Definition at line 717 of file io.c.
◆ m0_obj_read_lock_get()
◆ m0_obj_read_lock_get_sync()
◆ m0_obj_unit_size_to_layout_id()
uint64_t m0_obj_unit_size_to_layout_id |
( |
int |
unit_size | ) |
|
Maps a unit size to a layout id defined in Motr.
- Parameters
-
unit_size | Parity group unit size set. Only those unit sizes defined in layout/layout_pver.c are valid. |
- Returns
- 0 for invalid unit sizes and layout id for valid unit sizes.
Definition at line 836 of file obj.c.
◆ m0_obj_write_lock_get()
Acquires the write lock for the object asynchronously. Recursive locking is not supported.
This is a wrapper over m0_obj_lock_get();
- Precondition
- m0_obj_init()
-
m0_obj_lock_init()
- Parameters
-
obj | a pointer to m0_obj. |
req | representing a request to RM. |
clink | a link which will listen on a channel for the signal of completion. |
- Return values
-
Definition at line 233 of file obj_lock.c.
◆ m0_obj_write_lock_get_sync()
◆ m0_op_cancel()
void m0_op_cancel |
( |
struct m0_op ** |
op, |
|
|
uint32_t |
nr |
|
) |
| |
Cancels client operations. The caller is expected to wait for each operation to move to one of the terminal states. The process of cancellation does not guarantee restoring an object to any consistent state; that is left to the application. E.g. cancellation of a "create" operation does not invoke an "unlink" operation internally, and the application shall restore consistency by either "unlinking" the object or trying to recreate it. Todo: revisit the logic once DTM0 lands into dev.
- Parameters
-
op | collection of operations to cancel. |
- Precondition
- op != NULL
-
cancellation callback is defined for every operation.
Definition at line 639 of file client.c.
◆ m0_op_fini()
void m0_op_fini |
( |
struct m0_op * |
op | ) |
|
Finalises a complete operation. The state machine will be moved to M0_OS_UNINITIALISED.
- Parameters
-
op | Operation being finalised. |
- Precondition
- op != NULL
-
M0_IN(op->op_sm.sm_state, (M0_OS_INITIALISED, M0_OS_STABLE, M0_OS_FAILED))
Definition at line 847 of file client.c.
◆ m0_op_free()
void m0_op_free |
( |
struct m0_op * |
op | ) |
|
Frees a complete operation, allocated by the implementation.
- Parameters
-
- Precondition
- op != NULL
-
op->op_sm.sm_state == M0_OS_UNINITIALISED
Definition at line 885 of file client.c.
◆ m0_op_kick()
void m0_op_kick |
( |
struct m0_op * |
op | ) |
|
Asks the implementation to speed up progress of this operation toward stability.
The implementation is free to either honour this call by modifying various internal caching and queuing policies to process the operation with less delays, or to ignore this call altogether. This call may incur resource under-utilisation and other overheads.
- Parameters
-
op | Operation to be kicked. |
- Precondition
- op != NULL
-
op->op_sm.sm_state >= M0_OS_INITIALISED
- Todo:
- : put pressure on the rpc system to send this rpc message
- Todo:
- : send an fsync-force fop to hurry the placement of this transaction
- Todo:
- : could release/acquire the group lock in op_launch, and test whether the op is already in launched, allowing operations that are to-be-launched to be launched from here
Definition at line 924 of file client.c.
◆ m0_op_launch()
void m0_op_launch |
( |
struct m0_op ** |
op, |
|
|
uint32_t |
nr |
|
) |
| |
Launches a collection of operations. All operations must belong to the same client instance.
- Note
- the launched operations may be in other states than M0_OS_LAUNCHED by the time this call returns.
- Parameters
-
op | Array of operations to be launched. |
nr | Number of operations. |
- Precondition
- op != NULL
-
m0_forall(i, nr, op[i] != NULL)
-
m0_forall(i, nr, op[i]->op_sm.sm_state == M0_OS_INITIALISED)
-
m0_forall(i, nr, m0_entity_type_is_valid(op[i]->op_entity))
- Postcondition
- m0_forall(i, nr, op[i]->op_sm.sm_state >= M0_OS_LAUNCHED)
Definition at line 725 of file client.c.
◆ m0_op_maxsize()
size_t m0_op_maxsize |
( |
void |
| ) |
|
Returns the maximum size a client operation is expected to be. If pre-allocating 'struct m0_op's, allocations smaller than this size may be rejected with EMSGSIZE
◆ m0_op_setup()
Sets application-manipulable operation parameters.
- Parameters
-
op | Operation to be setup with callback functions. |
cbs | Callback functions. |
linger | The absolute time by which delays should be limited. If linger < m0_time_now(), the op is executed as soon as possible. |
- Precondition
- op != NULL
-
op->op_sm.sm_state == M0_OS_INITIALISED
Definition at line 908 of file client.c.
◆ m0_op_wait()
Waits until the operation reaches a desired state.
- Parameters
-
bits | Bitmask of states based on m0_op_state. M0_BITS() macro should be used to build a bitmask. |
op | Single operation to wait on. |
to | Absolute timeout for the wait. |
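Example of handling the result of the wait:
if (result == -ETIMEDOUT)
    ... // the wait timed out
else if (result == 0) {
    // the wait finished; check which state was reached
    if (op->op_sm.sm_state == M0_OS_STABLE) {
        ...
    } else {
        ...
    }
} else {
    ... // some other error
}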
- Precondition
- op != NULL
-
bits != 0
-
(bits & ~M0_BITS(M0_OS_LAUNCHED, M0_OS_EXECUTED, M0_OS_STABLE, M0_OS_FAILED)) == 0
Definition at line 739 of file client.c.
◆ m0_process_fid()
void m0_process_fid |
( |
const struct m0_client * |
m0c, |
|
|
struct m0_fid * |
proc_fid |
|
) |
| |
Returns the process fid of the motr instance.
- Parameters
-
m0c | The client instance being queried. |
proc_fid | The returned process fid. |
- Returns
- 0 for success and valid process fid is stored in proc_fid, anything else for an error.
Definition at line 1766 of file client_init.c.
◆ m0_rc()
int32_t m0_rc |
( |
const struct m0_op * |
op | ) |
|
An API to return the return code from an op.
This is basically a function that abstracts the return code member access for applications using client.
- Parameters
-
op | Operation whose return code is to be accessed. |
- Precondition
- op != NULL
Definition at line 943 of file client.c.
◆ m0_realm_close()
void m0_realm_close |
( |
struct m0_realm * |
realm, |
|
|
uint64_t |
wcount, |
|
|
uint64_t |
rcount, |
|
|
struct m0_op ** |
op |
|
) |
| |
◆ m0_realm_create()
void m0_realm_create |
( |
struct m0_realm * |
realm, |
|
|
uint64_t |
wcount, |
|
|
uint64_t |
rcount, |
|
|
struct m0_op ** |
op |
|
) |
| |
◆ m0_realm_open()
void m0_realm_open |
( |
struct m0_realm * |
realm, |
|
|
uint64_t |
wcount, |
|
|
uint64_t |
rcount, |
|
|
struct m0_op ** |
op |
|
) |
| |
◆ m0_sync()
int m0_sync |
( |
struct m0_client * |
m0c, |
|
|
bool |
wait |
|
) |
| |
Motr sync instance entry point, corresponding to m0t1fs_sync_fs() in m0t1fs.
- Parameters
-
m0c | The Motr instance is going to be sync'ed. |
wait | Ask client to wait till pending tx's are done if set to "true". |
- Returns
- 0 for success, anything else for an error.
Entry point for syncing all the pending tx in the Client instance. Unlike sync_core this function acquires the sc_max_pending_tx_lock for each service, as there is not a larger-granularity lock.
Definition at line 1093 of file sync.c.
◆ m0_sync_entity_add()
Adds an entity to SYNC op.
- Parameters
-
sop | The SYNC op where an entity is added to. |
entity | The entity to be SYNC-ed. |
- Returns
- 0 for success, anything else for an error.
Definition at line 985 of file sync.c.
◆ m0_sync_op_add()
int m0_sync_op_add |
( |
struct m0_op * |
sop, |
|
|
struct m0_op * |
op |
|
) |
| |
Adds an op to SYNC op.
- Parameters
-
sop | The SYNC op to which the op is added. |
op | The operation to be SYNC-ed. |
- Returns
- 0 for success, anything else for an error.
Definition at line 1020 of file sync.c.
◆ m0_sync_op_init()
int m0_sync_op_init |
( |
struct m0_op ** |
sop | ) |
|
Allocates and initialises a SYNC operation.
- Parameters
-
sop | A new SYNC op is created, entities and ops can be added into this SYNC op once it's initialised. |
- Returns
- 0 for success, anything else for an error.
Definition at line 972 of file sync.c.
◆ main()
int main |
( |
int |
argc, |
|
|
char ** |
argv |
|
) |
| |
◆ obj_lock_incoming_complete()
static void obj_lock_incoming_complete |
( |
struct m0_rm_incoming * |
in, |
|
|
int32_t |
rc |
|
) |
| |
|
static |
Lock request completion callback
Definition at line 307 of file obj_lock.c.
◆ obj_lock_incoming_conflict()
Lock request conflict callback
Definition at line 321 of file obj_lock.c.
◆ opts_get()
static int opts_get |
( |
struct params * |
par, |
|
|
int * |
argc, |
|
|
char *** |
argv |
|
) |
| |
|
static |
◆ per_item_rcs_analyse()
static int per_item_rcs_analyse |
( |
int32_t * |
rcs, |
|
|
int |
cnt |
|
) |
| |
|
static |
◆ rm_ctx_fini()
static void rm_ctx_fini |
( |
struct m0_ref * |
ref | ) |
|
|
static |
Finalizes the rm_lock_ctx
Definition at line 168 of file obj_lock.c.
◆ rm_ctx_init()
Initialises the rm_lock_ctx
Definition at line 114 of file obj_lock.c.
◆ rm_hash_func()
static uint64_t rm_hash_func |
( |
const struct m0_htable * |
htable, |
|
|
const void * |
k |
|
) |
| |
|
static |
◆ rm_key_eq()
static bool rm_key_eq |
( |
const void * |
key1, |
|
|
const void * |
key2 |
|
) |
| |
|
static |
◆ rm_lock_req_fini()
Finalizes lock request
Definition at line 296 of file obj_lock.c.
◆ rm_lock_req_init()
Initialises lock request
Definition at line 279 of file obj_lock.c.
◆ subsystem_id()
static int subsystem_id |
( |
char * |
name | ) |
|
|
static |
◆ usage()
static void usage |
( |
void |
| ) |
|
|
static |
◆ vals_load()
static int vals_load |
( |
const char * |
value, |
|
|
struct m0_bufvec * |
vals |
|
) |
| |
|
static |
◆ vals_xcode()
static int vals_xcode |
( |
const char * |
value, |
|
|
void * |
buf, |
|
|
m0_bcount_t * |
size |
|
) |
| |
|
static |
◆ cc_ctx
◆ cek_layer_id
◆ cek_off
◆ cev_len
◆ co_realm
◆ commands
Initial value:= {
{
CRT,
"create",
"create FID_PARAM, create index" },
{
DRP,
"drop",
"drop FID_PARAM, drop existing index"},
{
LST,
"list",
"list FID NUM, get indicies" },
{
LKP,
"lookup",
"lookup FID_PARAM, lookup index in storage" },
{
PUT,
"put",
"put FID_PARAM KEY_PARAM VAL_PARAM, put record" },
{
DEL,
"del",
"del FID_PARAM KEY_PARAM, delete record" },
{
GET,
"get",
"get FID KEY_PARAM, lookup and returns values by key" },
{
NXT,
"next",
"next FID KEY CNT, returns records larger than KEY " },
{
GENF,
"genf",
"genf CNT FILE, generate file with several FID" },
{
GENV,
"genv",
"genv CNT SIZE FILE, generate file with several " "KEY_PARAM/VAL_PARAM. Note: SIZE > 16" },
}
Definition at line 52 of file index_parser.c.
◆ dt_realm
◆ en_flags
◆ en_id
Globally unique, not re-usable entity identifier.
Definition at line 708 of file client.h.
◆ en_pending_tx
struct m0_tl en_pending_tx |
list of pending transactions.
Definition at line 736 of file client.h.
◆ en_pending_tx_lock
◆ en_realm
Parent realm, this entity lives in.
Definition at line 710 of file client.h.
◆ en_sm
Entity state machine. Used internally by the implementation. For the reference, the state diagram is:
* create
* CREATING<--------+
* | |
* | |
* | |
* | |
* +---------->INIT<----------------------CLOSING
* | | | ^
* | | | |
* | | | | close
* | | | |
* DELETING<--------+ +-------->OPENING-------->OPEN
* delete open
*
Definition at line 732 of file client.h.
◆ en_sm_group
Each entity has its own sm group.
Definition at line 734 of file client.h.
◆ en_type
◆ ep_realm
◆ idx_layout_type
DIX pool layout type. Please refer to enum dix_layout_type.
Definition at line 813 of file client.h.
◆ idx_pver
DIX pool version.
Definition at line 815 of file client.h.
◆ in_attr
◆ in_entity
◆ instance
◆ is_str [1/2]
◆ is_str [2/2]
◆ M0_ID_APP
First identifier that applications are free to use.
It is guaranteed that M0_UBER_REALM falls into reserved extent.
- Invariant
- m0_uint128_cmp(&M0_UBER_REALM, &M0_ID_APP) < 0
First identifier the application is allowed to use. The first 0x100000 ids are reserved for use by client.
Definition at line 92 of file client.c.
◆ M0_UBER_REALM
The identifier of the root of realm hierarchy.
Pre-defined identifier of the over-arching realm.
Definition at line 85 of file client.c.
◆ M0_XCA_ENUM [1/2]
◆ M0_XCA_ENUM [2/2]
◆ mc_addb_size
◆ mc_ha_addr
HA service's endpoint.
Definition at line 935 of file client.h.
◆ mc_idx_service_conf
void* mc_idx_service_conf |
◆ mc_idx_service_id
◆ mc_is_addb_init
Flag to enable/disable addb2 initialization
Definition at line 930 of file client.h.
◆ mc_is_oostore
oostore mode is set when 'is_oostore' is TRUE.
Definition at line 920 of file client.h.
◆ mc_is_read_verify
Flag for verify-on-read. Parity is checked when doing READ's if this flag is set.
Definition at line 925 of file client.h.
◆ mc_layout_id
◆ mc_local_addr
const char* mc_local_addr |
Local endpoint.
Definition at line 933 of file client.h.
◆ mc_max_rpc_msg_size
uint32_t mc_max_rpc_msg_size |
The maximum rpc message size, use M0_RPC_DEF_MAX_RPC_MSG_SIZE if unsure.
Definition at line 949 of file client.h.
◆ mc_process_fid
const char* mc_process_fid |
Process fid for rmservice.
Definition at line 937 of file client.h.
◆ mc_profile
◆ mc_tm_recv_queue_min_len
uint32_t mc_tm_recv_queue_min_len |
The minimum length of the 'tm' receive queue, use M0_NET_TM_RECV_QUEUE_DEF_LEN if unsure.
Definition at line 944 of file client.h.
◆ ml_entity
◆ ml_obj
◆ ml_ops
◆ ml_type
◆ oa_bshift
Binary logarithm (bit-shift) of object minimal block size.
Definition at line 749 of file client.h.
◆ oa_buf_size
◆ oa_layout_id
Layout ID for an object.
Definition at line 752 of file client.h.
◆ oa_pool
The pool this object stores data to. A pool can be selected when creating an object by specifying this field. A pool version matching the specified pool fid is then chosen for the object. The pool version is then stored as one of its attributes in service's backend.
Definition at line 760 of file client.h.
◆ oa_pver
Pool version fid
Definition at line 763 of file client.h.
◆ ob_attr
◆ ob_cookie
Cookie associated with a RM context
Definition at line 793 of file client.h.
◆ ob_entity
◆ ob_layout
◆ obj_lock_incoming_ops
Initial value:= {
}
static void obj_lock_incoming_complete(struct m0_rm_incoming *in, int32_t rc)
static void obj_lock_incoming_conflict(struct m0_rm_incoming *in)
Definition at line 60 of file obj_lock.c.
◆ oop_executed
void(* oop_executed) (struct m0_op *op) |
◆ oop_failed
void(* oop_failed) (struct m0_op *op) |
◆ oop_stable
void(* oop_stable) (struct m0_op *op) |
◆ op_cbs
Application-supplied call-backs.
Definition at line 658 of file client.h.
◆ op_code
◆ op_count
◆ op_datum
◆ op_entity
The entity this operation is on.
Definition at line 660 of file client.h.
◆ op_gen
Part of a cookie (m0_cookie) used to identify this operation.
Definition at line 666 of file client.h.
◆ op_linger
Caching dead-line.
Definition at line 662 of file client.h.
◆ op_magic
◆ op_parent
Back pointer to the parent op; it is used to form an execution plan for a group of ops. An example: a composite layout IO op is divided into a few IO ops to sub-objects. Each sub-object IO op has a pointer to the composite IO op.
Definition at line 673 of file client.h.
◆ op_parent_ast
◆ op_pending_tx
struct m0_tl op_pending_tx |
list of pending transactions.
Definition at line 676 of file client.h.
◆ op_pending_tx_lock
◆ op_priv
Private field, to be used by internal implementation.
Definition at line 684 of file client.h.
◆ op_priv_lock
◆ op_rc
Operation result code
Definition at line 652 of file client.h.
◆ op_size
Size of the ambient operation structure.
Definition at line 664 of file client.h.
◆ op_sm
Operation state machine.
Definition at line 656 of file client.h.
◆ op_sm_group
Each op has its own sm group.
Definition at line 654 of file client.h.
◆ re_entity
◆ re_instance
◆ re_type
◆ subsystems
Initial value:= {
}
int index_init(struct params *params)
int index_execute(int argc, char **argv)
Definition at line 51 of file cmd_main.c.