Motr  M0
file.c
Go to the documentation of this file.
1 /* -*- C -*- */
2 /*
3  * Copyright (c) 2012-2021 Seagate Technology LLC and/or its Affiliates
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  * For any questions about this software or licensing,
18  * please email opensource@seagate.com or cortx-questions@seagate.com.
19  *
20  */
21 
22 
23 #include <linux/version.h> /* LINUX_VERSION_CODE */
24 #if LINUX_VERSION_CODE <= KERNEL_VERSION(4,11,0)
25 #include <asm/uaccess.h> /* VERIFY_READ, VERIFY_WRITE */
26 #endif
27 #include <asm/atomic.h> /* atomic_get */
28 #include <linux/mm.h> /* get_user_pages, get_page, put_page */
29 #include <linux/fs.h> /* struct file_operations */
30 #include <linux/mount.h> /* struct vfsmount (f_path.mnt) */
31 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
32 #include <linux/uio.h> /* struct iovec */
33 #include <linux/aio.h> /* struct kiocb */
34 #endif
35 
36 #define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_M0T1FS
37 #include "lib/trace.h"
38 
39 #include "fop/fom_generic.h"/* m0_rpc_item_is_generic_reply_fop */
40 #include "lib/memory.h" /* m0_alloc, m0_free */
41 #include "lib/misc.h" /* m0_round_{up/down} */
42 #include "lib/bob.h" /* m0_bob_type */
43 #include "lib/ext.h" /* m0_ext */
44 #include "lib/arith.h" /* min_type */
45 #include "lib/finject.h" /* M0_FI_ENABLED */
46 #include "layout/pdclust.h" /* M0_PUT_*, m0_layout_to_pdl, */
47 #include "lib/bob.h" /* m0_bob_type */
48 #include "lib/tlist.h"
49 #include "rpc/rpc_machine.h" /* m0_rpc_machine, m0_rpc_machine_lock */
50 #include "ioservice/io_fops.h" /* m0_io_fop */
51 #include "motr/magic.h" /* M0_T1FS_IOREQ_MAGIC */
52 #include "m0t1fs/linux_kernel/m0t1fs.h" /* m0t1fs_sb */
53 #include "file/file.h"
54 #include "fd/fd.h" /* m0_fd_fwd_map m0_fd_bwd_map */
55 #include "lib/hash.h" /* m0_htable */
56 #include "sns/parity_repair.h" /*m0_sns_repair_spare_map() */
57 #include "addb2/addb2.h"
61 #include "ioservice/fid_convert.h" /* m0_fid_cob_device_id */
62 
322 struct io_mem_stats iommstats;
323 
324 M0_INTERNAL void iov_iter_advance(struct iov_iter *i, size_t bytes);
325 
326 /* Imports */
327 struct m0_net_domain;
328 M0_INTERNAL bool m0t1fs_inode_bob_check(struct m0t1fs_inode *bob);
329 M0_TL_DECLARE(rpcbulk, M0_INTERNAL, struct m0_rpc_bulk_buf);
330 M0_TL_DESCR_DECLARE(rpcbulk, M0_EXTERN);
331 
332 M0_TL_DESCR_DEFINE(iofops, "List of IO fops", static,
333  struct io_req_fop, irf_link, irf_magic,
335 
336 M0_TL_DEFINE(iofops, static, struct io_req_fop);
337 
338 static const struct m0_bob_type tioreq_bobtype;
340 static const struct m0_bob_type ioreq_bobtype;
341 static const struct m0_bob_type pgiomap_bobtype;
342 static const struct m0_bob_type nwxfer_bobtype;
343 static const struct m0_bob_type dtbuf_bobtype;
344 
351 
352 static const struct m0_bob_type ioreq_bobtype = {
353  .bt_name = "io_request_bobtype",
354  .bt_magix_offset = offsetof(struct io_request, ir_magic),
355  .bt_magix = M0_T1FS_IOREQ_MAGIC,
356  .bt_check = NULL,
357 };
358 
359 static const struct m0_bob_type pgiomap_bobtype = {
360  .bt_name = "pargrp_iomap_bobtype",
361  .bt_magix_offset = offsetof(struct pargrp_iomap, pi_magic),
362  .bt_magix = M0_T1FS_PGROUP_MAGIC,
363  .bt_check = NULL,
364 };
365 
366 static const struct m0_bob_type nwxfer_bobtype = {
367  .bt_name = "nw_xfer_request_bobtype",
368  .bt_magix_offset = offsetof(struct nw_xfer_request, nxr_magic),
369  .bt_magix = M0_T1FS_NWREQ_MAGIC,
370  .bt_check = NULL,
371 };
372 
373 static const struct m0_bob_type dtbuf_bobtype = {
374  .bt_name = "data_buf_bobtype",
375  .bt_magix_offset = offsetof(struct data_buf, db_magic),
376  .bt_magix = M0_T1FS_DTBUF_MAGIC,
377  .bt_check = NULL,
378 };
379 
380 static const struct m0_bob_type tioreq_bobtype = {
381  .bt_name = "target_ioreq",
382  .bt_magix_offset = offsetof(struct target_ioreq, ti_magic),
383  .bt_magix = M0_T1FS_TIOREQ_MAGIC,
384  .bt_check = NULL,
385 };
386 
387 /*
388  * These are used as macros since they are used as lvalues which is
389  * not possible by using static inline functions.
390  */
391 #define INDEX(ivec, i) ((ivec)->iv_index[(i)])
392 #define COUNT(ivec, i) ((ivec)->iv_vec.v_count[(i)])
393 #define SEG_NR(ivec) ((ivec)->iv_vec.v_nr)
394 
395 #define V_INDEX(ivec, i) (*(m0_bindex_t*)(m0_varr_ele_get(&(ivec)->iv_index, (i))))
396 #define V_ADDR(bv, i) (*(void**) (m0_varr_ele_get(&(bv )->iv_index, (i))))
397 #define V_COUNT(ivec, i) (*(m0_bcount_t*)(m0_varr_ele_get(&(ivec)->iv_count, (i))))
398 #define V_SEG_NR(ivec) ((ivec)->iv_nr)
399 
400 #define PA(pa, i) (*(enum page_attr*)(m0_varr_ele_get((pa), (i))))
401 
402 #define indexvec_dump(ivec) \
403 do { \
404  int seg; \
405  for (seg = 0; seg < SEG_NR((ivec)); ++seg) { \
406  M0_LOG(M0_DEBUG, "seg# %d: [pos, +len) = [%llu, +%llu)", \
407  seg, INDEX((ivec), seg), COUNT((ivec), seg)); \
408  } \
409 } while (0)
410 
411 #define indexvec_varr_dump(ivec) \
412 do { \
413  int seg; \
414  for (seg = 0; seg < V_SEG_NR((ivec)); ++seg) { \
415  M0_LOG(M0_DEBUG, "seg# %d: [pos, +len) = [%llu, +%llu)", \
416  seg, V_INDEX((ivec), seg), V_COUNT((ivec), seg)); \
417  } \
418 } while (0)
419 
420 static inline m0_bcount_t seg_endpos(const struct m0_indexvec *ivec, uint32_t i)
421 {
422  M0_PRE(ivec != NULL);
423 
424  return INDEX(ivec, i) + COUNT(ivec, i);
425 }
426 
427 static inline m0_bcount_t
428 v_seg_endpos(struct m0_indexvec_varr *ivec, uint32_t i)
429 {
430  M0_PRE(ivec != NULL);
431 
432  return V_INDEX(ivec, i) + V_COUNT(ivec, i);
433 }
434 
435 M0_INTERNAL struct inode *m0t1fs_file_to_inode(const struct file *file)
436 {
437 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0)
438  return file->f_path.dentry->d_inode;
439 #else
440  return file->f_dentry->d_inode;
441 #endif
442 }
443 
444 M0_INTERNAL struct m0t1fs_inode *m0t1fs_file_to_m0inode(const struct file *file)
445 {
446  return M0T1FS_I(m0t1fs_file_to_inode(file));
447 }
448 
449 M0_INTERNAL struct m0_pool_version *m0t1fs_file_to_pver(const struct file *file)
450 {
451  struct m0t1fs_inode *inode = M0T1FS_I(m0t1fs_file_to_inode(file));
452  struct m0t1fs_sb *csb = M0T1FS_SB(m0t1fs_file_to_inode(file)->i_sb);
453 
454  return m0_pool_version_find(&csb->csb_pools_common, &inode->ci_pver);
455 }
456 
457 M0_INTERNAL struct m0_poolmach *m0t1fs_file_to_poolmach(const struct file *file)
458 {
459  return &m0t1fs_file_to_pver(file)->pv_mach;
460 }
461 
462 M0_INTERNAL struct m0t1fs_inode *m0t1fs_inode_to_m0inode(const struct inode *inode)
463 {
464  return M0T1FS_I(inode);
465 }
466 
467 static inline struct inode *iomap_to_inode(const struct pargrp_iomap *map)
468 {
469  return m0t1fs_file_to_inode(map->pi_ioreq->ir_file);
470 }
471 
472 M0_INTERNAL struct m0t1fs_sb *m0inode_to_sb(const struct m0t1fs_inode *m0inode)
473 {
474  return M0T1FS_SB(m0inode->ci_inode.i_sb);
475 }
476 
477 static inline const struct m0_fid *file_to_fid(const struct file *file)
478 {
480 }
481 
482 static inline struct m0t1fs_sb *file_to_sb(const struct file *file)
483 {
484  return M0T1FS_SB(m0t1fs_file_to_inode(file)->i_sb);
485 }
486 
487 static inline struct m0_sm_group *file_to_smgroup(const struct file *file)
488 {
489  return &file_to_sb(file)->csb_iogroup;
490 }
491 
492 static inline uint64_t page_nr(m0_bcount_t size)
493 {
494  return size >> PAGE_SHIFT;
495 }
496 
497 static struct m0_layout_instance *
499 {
500  return m0t1fs_file_to_m0inode(req->ir_file)->ci_layout_instance;
501 }
502 
503 static inline struct m0_pdclust_instance *
505 {
506  return m0_layout_instance_to_pdi(li);
507 }
508 
509 static inline struct m0_pdclust_layout *
511 {
512  return m0_layout_to_pdl(layout_instance(req)->li_l);
513 }
514 
515 static inline uint32_t layout_n(const struct m0_pdclust_layout *play)
516 {
517  return play->pl_attr.pa_N;
518 }
519 
520 static inline uint32_t layout_k(const struct m0_pdclust_layout *play)
521 {
522  return play->pl_attr.pa_K;
523 }
524 
525 static inline uint64_t layout_unit_size(const struct m0_pdclust_layout *play)
526 {
527  return play->pl_attr.pa_unit_size;
528 }
529 
530 static inline uint64_t parity_units_page_nr(const struct m0_pdclust_layout *play)
531 {
532  return page_nr(layout_unit_size(play)) * layout_k(play);
533 }
534 
535 static inline uint64_t indexvec_varr_count(struct m0_indexvec_varr *varr)
536 {
537  uint64_t sum = 0;
538 
539  m0_varr_for(&varr->iv_count, uint64_t *, i, countp) {
540  sum += *(uint64_t*)countp;
541  } m0_varr_endfor;
542  return sum;
543 }
544 
545 static inline uint64_t iomap_page_nr(struct pargrp_iomap *map)
546 {
547  return page_nr(indexvec_varr_count(&map->pi_ivv));
548 }
549 
550 static inline uint64_t data_size(const struct m0_pdclust_layout *play)
551 {
552  return layout_n(play) * layout_unit_size(play);
553 }
554 
555 static inline struct m0_parity_math *parity_math(struct io_request *req)
556 {
558 }
559 
560 static inline uint64_t group_id(m0_bindex_t index, m0_bcount_t dtsize)
561 {
562  return index / dtsize;
563 }
564 
565 static inline bool is_page_read(struct data_buf *dbuf)
566 {
567  return dbuf->db_flags & PA_READ &&
568  dbuf->db_tioreq != NULL && dbuf->db_tioreq->ti_rc == 0;
569 }
570 
571 static inline uint64_t target_offset(uint64_t frame,
572  struct m0_pdclust_layout *play,
573  m0_bindex_t gob_offset)
574 {
575  return frame * layout_unit_size(play) +
576  (gob_offset % layout_unit_size(play));
577 }
578 
579 static inline uint32_t target_ioreq_type_get(struct target_ioreq *ti)
580 {
581  return ti->ti_req_type;
582 }
583 
584 static inline void target_ioreq_type_set(struct target_ioreq *ti,
585  enum target_ioreq_type type)
586 {
587  ti->ti_req_type = type;
588 }
589 
590 static bool is_pver_dud(uint32_t fdev_nr, uint32_t dev_k, uint32_t fsvc_nr,
591  uint32_t svc_k);
592 
593 static uint64_t tioreqs_hash_func(const struct m0_htable *htable, const void *k)
594 {
595  const uint64_t *key = (uint64_t *)k;
596 
597  return *key % htable->h_bucket_nr;
598 }
599 
600 static bool tioreq_key_eq(const void *key1, const void *key2)
601 {
602  const uint64_t *k1 = (uint64_t *)key1;
603  const uint64_t *k2 = (uint64_t *)key2;
604 
605  return *k1 == *k2;
606 }
607 
608 M0_HT_DESCR_DEFINE(tioreqht, "Hash of target_ioreq objects", static,
609  struct target_ioreq, ti_link, ti_magic,
611  ti_fid.f_container, tioreqs_hash_func, tioreq_key_eq);
612 
613 M0_HT_DEFINE(tioreqht, static, struct target_ioreq, uint64_t);
614 
615 /* Finds the parity group associated with a given target offset.
616  * index - target offset for intended IO.
617  * req - IO-request holding information about IO.
618  * tio_req - io-request for given target.
619  * src - output parity group.
620  */
622  const struct io_request *req,
623  const struct target_ioreq *tio_req,
624  struct m0_pdclust_src_addr *src)
625 {
626  struct m0_pdclust_tgt_addr tgt;
627  struct m0_pdclust_layout *play;
628 
629  M0_PRE(req != NULL);
630  M0_PRE(src != NULL);
631 
632  play = pdlayout_get(req);
633  tgt.ta_obj = tio_req->ti_obj;
636 }
637 
638 static inline uint64_t pargrp_id_find(m0_bindex_t index,
639  const struct io_request *req,
640  const struct io_req_fop *ir_fop)
641 {
642  struct m0_pdclust_src_addr src;
643 
644  pargrp_src_addr(index, req, ir_fop->irf_tioreq, &src);
645  return src.sa_group;
646 }
647 
649  const struct pargrp_iomap *map,
650  const struct m0_pdclust_layout *play,
651  const struct m0_pdclust_src_addr *src)
652 {
653  m0_bindex_t goff;
654 
655  M0_PRE(map != NULL);
656  M0_PRE(play != NULL);
657 
658  M0_ENTRY("grpid = %llu, target_off = %llu", map->pi_grpid, toff);
659 
660  goff = map->pi_grpid * data_size(play) +
661  src->sa_unit * layout_unit_size(play) +
662  toff % layout_unit_size(play);
663  M0_LEAVE("global file offset = %llu", goff);
664 
665  return goff;
666 }
667 
668 static inline struct m0_fid target_fid(const struct io_request *req,
669  struct m0_pdclust_tgt_addr *tgt)
670 {
671  struct m0_fid fid;
672 
675  &fid);
676  return fid;
677 }
678 
679 static inline struct m0_rpc_session *target_session(struct io_request *req,
680  struct m0_fid tfid)
681 {
683  m0_fid_cob_device_id(&tfid));
684 }
685 
686 static inline uint64_t page_id(m0_bindex_t offset)
687 {
688  return offset >> PAGE_SHIFT;
689 }
690 
691 static inline uint32_t rows_nr(struct m0_pdclust_layout *play)
692 {
693  return page_nr(layout_unit_size(play));
694 }
695 
696 #if !defined(round_down)
697 static inline uint64_t round_down(uint64_t val, uint64_t size)
698 {
700 
701  /*
702  * Returns current value if it is already a multiple of size,
703  * else m0_round_down() is invoked.
704  */
705  return (val & (size - 1)) == 0 ?
707 }
708 #endif
709 
710 #if !defined(round_up)
711 static inline uint64_t round_up(uint64_t val, uint64_t size)
712 {
714 
715  /*
716  * Returns current value if it is already a multiple of size,
717  * else m0_round_up() is invoked.
718  */
719  return (val & (size - 1)) == 0 ?
720  val : m0_round_up(val, size);
721 }
722 #endif
723 
724 /* Returns the position of page in matrix of data buffers. */
725 static void page_pos_get(struct pargrp_iomap *map,
727  uint32_t *row,
728  uint32_t *col)
729 {
730  uint64_t pg_id;
731  struct m0_pdclust_layout *play;
732 
733  M0_PRE(map != NULL);
734  M0_PRE(row != NULL);
735  M0_PRE(col != NULL);
736 
737  play = pdlayout_get(map->pi_ioreq);
738 
739  pg_id = page_id(index - data_size(play) * map->pi_grpid);
740  *row = pg_id % rows_nr(play);
741  *col = pg_id / rows_nr(play);
742 }
743 
746  uint32_t *row,
747  uint32_t *col)
748 {
749  uint64_t pg_id;
750  struct m0_pdclust_layout *play;
751 
752  M0_PRE(map != NULL);
753  M0_PRE(row != NULL);
754  M0_PRE(col != NULL);
755 
756  play = pdlayout_get(map->pi_ioreq);
757 
758  pg_id = page_id(index);
759  *row = pg_id % rows_nr(play);
760  *col = pg_id / rows_nr(play);
761 }
762 
763 /*
764  * Returns the starting offset of page given its position in data matrix.
765  * Acts as opposite of page_pos_get() API.
766  */
768  uint32_t row,
769  uint32_t col)
770 {
771  struct m0_pdclust_layout *play;
773 
774  M0_PRE(map != NULL);
775  M0_ENTRY("gid = %llu, row = %u, col = %u", map->pi_grpid, row, col);
776 
777  play = pdlayout_get(map->pi_ioreq);
778 
779  M0_ASSERT(row < rows_nr(play));
780  M0_ASSERT(col < layout_n(play));
781 
782  out = data_size(play) * map->pi_grpid +
783  col * layout_unit_size(play) + row * PAGE_SIZE;
784 
785  M0_LEAVE("offsef = %llu", out);
786  return out;
787 }
788 
789 /* Invoked during m0t1fs mount. */
790 M0_INTERNAL void io_bob_tlists_init(void)
791 {
795 }
796 
797 static void device_state_reset(struct nw_xfer_request *xfer, bool rmw);
798 
799 static void io_rpc_item_cb (struct m0_rpc_item *item);
800 static void io_req_fop_release(struct m0_ref *ref);
801 static void cc_rpc_item_cb(struct m0_rpc_item *item);
802 static void cc_fop_release(struct m0_ref *ref);
803 
804 /*
805  * io_rpc_item_cb can not be directly invoked from io fops code since it
806  * leads to build dependency of ioservice code over kernel-only code (m0t1fs).
807  * Hence, a new m0_rpc_item_ops structure is used for fops dispatched
808  * by m0t1fs io requests.
809  */
810 static const struct m0_rpc_item_ops io_item_ops = {
812 };
813 
814 static const struct m0_rpc_item_ops cc_item_ops = {
816 };
817 
818 static bool nw_xfer_request_invariant(const struct nw_xfer_request *xfer);
819 
820 static int nw_xfer_io_distribute(struct nw_xfer_request *xfer);
821 static void nw_xfer_req_complete (struct nw_xfer_request *xfer,
822  bool rmw);
823 static int nw_xfer_req_dispatch (struct nw_xfer_request *xfer);
824 
825 static int nw_xfer_tioreq_map (struct nw_xfer_request *xfer,
826  const struct m0_pdclust_src_addr *src,
827  struct m0_pdclust_tgt_addr *tgt,
828  struct target_ioreq **tio);
829 
830 static int nw_xfer_tioreq_get (struct nw_xfer_request *xfer,
831  const struct m0_fid *fid,
832  uint64_t ta_obj,
833  struct m0_rpc_session *session,
834  uint64_t size,
835  struct target_ioreq **out);
836 
837 static const struct nw_xfer_ops xfer_ops = {
839  .nxo_complete = nw_xfer_req_complete,
840  .nxo_dispatch = nw_xfer_req_dispatch,
841  .nxo_tioreq_map = nw_xfer_tioreq_map,
842 };
843 
844 static int pargrp_iomap_populate (struct pargrp_iomap *map,
845  struct m0_ivec_varr_cursor *cursor);
846 
847 static bool pargrp_iomap_spans_seg (struct pargrp_iomap *map,
850 
851 static int pargrp_iomap_readrest (struct pargrp_iomap *map);
852 
853 
854 static int pargrp_iomap_seg_process (struct pargrp_iomap *map,
855  uint64_t seg,
856  bool rmw);
857 
858 static int pargrp_iomap_parity_recalc(struct pargrp_iomap *map);
859 static int pargrp_iomap_parity_verify(struct pargrp_iomap *map);
860 
861 static uint64_t pargrp_iomap_fullpages_count(struct pargrp_iomap *map);
862 
864 
866 
867 static int pargrp_iomap_dgmode_process (struct pargrp_iomap *map,
868  struct target_ioreq *tio,
870  uint32_t count);
871 
873 
874 static int pargrp_iomap_dgmode_recover (struct pargrp_iomap *map);
875 
876 static const struct pargrp_iomap_ops iomap_ops = {
878  .pi_spans_seg = pargrp_iomap_spans_seg,
879  .pi_readrest = pargrp_iomap_readrest,
880  .pi_fullpages_find = pargrp_iomap_fullpages_count,
881  .pi_seg_process = pargrp_iomap_seg_process,
882  .pi_readold_auxbuf_alloc = pargrp_iomap_readold_auxbuf_alloc,
883  .pi_parity_recalc = pargrp_iomap_parity_recalc,
884  .pi_parity_verify = pargrp_iomap_parity_verify,
885  .pi_paritybufs_alloc = pargrp_iomap_paritybufs_alloc,
886  .pi_dgmode_process = pargrp_iomap_dgmode_process,
887  .pi_dgmode_postprocess = pargrp_iomap_dgmode_postprocess,
888  .pi_dgmode_recover = pargrp_iomap_dgmode_recover,
889 };
890 
891 static bool pargrp_iomap_invariant_nr (struct io_request *req);
892 static bool target_ioreq_invariant (struct target_ioreq *ti);
893 
894 static void target_ioreq_fini (struct target_ioreq *ti);
895 
896 static int target_ioreq_iofops_prepare(struct target_ioreq *ti,
897  enum page_attr filter);
898 
899 static void target_ioreq_seg_add(struct target_ioreq *ti,
900  const struct m0_pdclust_src_addr *src,
901  const struct m0_pdclust_tgt_addr *tgt,
902  m0_bindex_t gob_offset,
904  struct pargrp_iomap *map);
905 
906 static int target_cob_create_fop_prepare(struct target_ioreq *ti);
907 static const struct target_ioreq_ops tioreq_ops = {
909  .tio_iofops_prepare = target_ioreq_iofops_prepare,
910  .tio_cc_fops_prepare = target_cob_create_fop_prepare,
911 };
912 
913 static int io_req_fop_dgmode_read(struct io_req_fop *irfop);
914 
915 static struct data_buf *data_buf_alloc_init(enum page_attr pattr);
916 
917 static void data_buf_dealloc_fini(struct data_buf *buf);
918 
919 static void io_bottom_half(struct m0_sm_group *grp, struct m0_sm_ast *ast);
920 
921 static void cc_bottom_half(struct m0_sm_group *grp, struct m0_sm_ast *ast);
922 
923 static int ioreq_iomaps_prepare(struct io_request *req);
924 
925 static void ioreq_iomaps_destroy(struct io_request *req);
926 
927 static int ioreq_user_data_copy (struct io_request *req,
928  enum copy_direction dir,
929  enum page_attr filter);
930 
931 static int ioreq_parity_recalc (struct io_request *req);
932 static int ioreq_parity_verify (struct io_request *req);
933 
934 static int ioreq_iosm_handle (struct io_request *req);
935 
936 static int ioreq_file_lock (struct io_request *req);
937 static void ioreq_file_unlock (struct io_request *req);
938 static int ioreq_no_lock (struct io_request *req);
939 static void ioreq_no_unlock (struct io_request *req);
940 
941 static int ioreq_dgmode_read (struct io_request *req, bool rmw);
942 static int ioreq_dgmode_write (struct io_request *req, bool rmw);
943 static int ioreq_dgmode_recover (struct io_request *req);
944 
945 static bool should_req_sm_complete(struct io_request *req);
946 
947 static const struct io_request_ops ioreq_ops = {
949  .iro_iomaps_destroy = ioreq_iomaps_destroy,
950  .iro_user_data_copy = ioreq_user_data_copy,
951  .iro_parity_recalc = ioreq_parity_recalc,
952  .iro_parity_verify = ioreq_parity_verify,
953  .iro_iosm_handle = ioreq_iosm_handle,
954  .iro_file_lock = ioreq_file_lock,
955  .iro_file_unlock = ioreq_file_unlock,
956  .iro_dgmode_read = ioreq_dgmode_read,
957  .iro_dgmode_write = ioreq_dgmode_write,
958  .iro_dgmode_recover = ioreq_dgmode_recover,
959 };
960 
961 static const struct io_request_ops ioreq_oostore_ops = {
963  .iro_iomaps_destroy = ioreq_iomaps_destroy,
964  .iro_user_data_copy = ioreq_user_data_copy,
965  .iro_parity_recalc = ioreq_parity_recalc,
966  .iro_parity_verify = ioreq_parity_verify,
967  .iro_iosm_handle = ioreq_iosm_handle,
968  .iro_file_lock = ioreq_no_lock,
969  .iro_file_unlock = ioreq_no_unlock,
970  .iro_dgmode_read = ioreq_dgmode_read,
971  .iro_dgmode_write = ioreq_dgmode_write,
972  .iro_dgmode_recover = ioreq_dgmode_recover,
973 };
974 
975 static inline uint32_t ioreq_sm_state(const struct io_request *req)
976 {
977  return req->ir_sm.sm_state;
978 }
979 
980 static struct m0_sm_state_descr io_states[] = {
981  [IRS_INITIALIZED] = {
983  .sd_name = "IO_initial",
984  .sd_allowed = M0_BITS(IRS_READING, IRS_WRITING,
986  },
987  [IRS_READING] = {
988  .sd_name = "IO_reading",
989  .sd_allowed = M0_BITS(IRS_READ_COMPLETE, IRS_FAILED)
990  },
991  [IRS_READ_COMPLETE] = {
992  .sd_name = "IO_read_complete",
993  .sd_allowed = M0_BITS(IRS_WRITING, IRS_REQ_COMPLETE,
995  IRS_READING)
996  },
998  .sd_name = "IO_degraded_read",
999  .sd_allowed = M0_BITS(IRS_READ_COMPLETE, IRS_FAILED)
1000  },
1001  [IRS_DEGRADED_WRITING] = {
1002  .sd_name = "IO_degraded_write",
1003  .sd_allowed = M0_BITS(IRS_WRITE_COMPLETE, IRS_FAILED)
1004  },
1005  [IRS_WRITING] = {
1006  .sd_name = "IO_writing",
1007  .sd_allowed = M0_BITS(IRS_WRITE_COMPLETE, IRS_FAILED)
1008  },
1009  [IRS_WRITE_COMPLETE] = {
1010  .sd_name = "IO_write_complete",
1011  .sd_allowed = M0_BITS(IRS_REQ_COMPLETE, IRS_FAILED,
1013  },
1014  [IRS_FAILED] = {
1015  .sd_flags = M0_SDF_FAILURE,
1016  .sd_name = "IO_req_failed",
1017  .sd_allowed = M0_BITS(IRS_REQ_COMPLETE)
1018  },
1019  [IRS_REQ_COMPLETE] = {
1020  .sd_flags = M0_SDF_TERMINAL,
1021  .sd_name = "IO_req_complete",
1022  },
1023 };
1024 
1025 static const struct m0_sm_conf io_sm_conf = {
1026  .scf_name = "IO request state machine configuration",
1027  .scf_nr_states = ARRAY_SIZE(io_states),
1028  .scf_state = io_states,
1029 };
1030 
1031 static void ioreq_sm_failed(struct io_request *req, int rc)
1032 {
1033  M0_LOG(M0_DEBUG, "[%p] rc %d", req, rc);
1037 }
1038 
1039 static void ioreq_sm_state_set(struct io_request *req, int state)
1040 {
1041  M0_LOG(M0_INFO, "[%p] change state %s -> %s",
1042  req, io_states[ioreq_sm_state(req)].sd_name,
1043  io_states[state].sd_name);
1045  m0_sm_state_set(&req->ir_sm, state);
1047 }
1048 
1049 static void ioreq_sm_state_set_nolock(struct io_request *req, int state)
1050 {
1051  M0_LOG(M0_INFO, "[%p] change state %s -> %s",
1052  req, io_states[ioreq_sm_state(req)].sd_name,
1053  io_states[state].sd_name);
1054  m0_sm_state_set(&req->ir_sm, state);
1055 }
1056 
1058 {
1059  return
1060  _0C(io_request_bob_check(req)) &&
1061  _0C(req->ir_type <= IRT_TYPE_NR) &&
1062  _0C(req->ir_iovec != NULL) &&
1063  _0C(req->ir_ops != NULL) &&
1065 
1067  !tioreqht_htable_is_empty(&req->ir_nwxfer.
1068  nxr_tioreqs_hash))) &&
1069 
1071  !tioreqht_htable_is_empty(&req->ir_nwxfer.
1072  nxr_tioreqs_hash))) &&
1073 
1078 
1079  _0C(indexvec_varr_count(&req->ir_ivv) > 0) &&
1080 
1081  m0_forall(i, V_SEG_NR(&req->ir_ivv) - 1,
1082  _0C(v_seg_endpos(&req->ir_ivv, i) <=
1083  V_INDEX(&req->ir_ivv, i+1))) &&
1084 
1086 
1088 }
1089 
1090 static bool nw_xfer_request_invariant(const struct nw_xfer_request *xfer)
1091 {
1092  return _0C(nw_xfer_request_bob_check(xfer)) &&
1093  _0C(xfer->nxr_state <= NXS_STATE_NR) &&
1094 
1095  _0C(ergo(xfer->nxr_state == NXS_INITIALIZED,
1096  xfer->nxr_rc == 0 && xfer->nxr_bytes == 0 &&
1097  m0_atomic64_get(&xfer->nxr_iofop_nr) == 0)) &&
1098 
1099  _0C(ergo(xfer->nxr_state == NXS_INFLIGHT,
1100  !tioreqht_htable_is_empty(&xfer->nxr_tioreqs_hash))) &&
1101 
1102  _0C(ergo(xfer->nxr_state == NXS_COMPLETE,
1103  m0_atomic64_get(&xfer->nxr_iofop_nr) == 0 &&
1104  m0_atomic64_get(&xfer->nxr_rdbulk_nr) == 0)) &&
1105 
1106  m0_htable_forall(tioreqht, tioreq, &xfer->nxr_tioreqs_hash,
1107  target_ioreq_invariant(tioreq));
1108 }
1109 
1110 static bool data_buf_invariant(const struct data_buf *db)
1111 {
1112  return
1113  db != NULL &&
1114  data_buf_bob_check(db) &&
1115  ergo(db->db_buf.b_addr != NULL, db->db_buf.b_nob > 0);
1116 }
1117 
1118 static bool data_buf_invariant_nr(const struct pargrp_iomap *map)
1119 {
1120  uint32_t row;
1121  uint32_t col;
1122  struct m0_pdclust_layout *play;
1123 
1124  play = pdlayout_get(map->pi_ioreq);
1125  for (row = 0; row < rows_nr(play); ++row) {
1126  for (col = 0; col < layout_n(play); ++col) {
1127  if (map->pi_databufs[row][col] != NULL &&
1128  !data_buf_invariant(map->pi_databufs[row][col]))
1129  return false;
1130  }
1131  }
1132 
1133  if (map->pi_paritybufs != NULL) {
1134  for (row = 0; row < rows_nr(play); ++row) {
1135  for (col = 0; col < layout_k(play); ++col) {
1136  if (map->pi_paritybufs[row][col] != NULL &&
1137  !data_buf_invariant(map->pi_paritybufs
1138  [row][col]))
1139  return false;
1140  }
1141  }
1142  }
1143  return true;
1144 }
1145 
1146 static void data_buf_init(struct data_buf *buf, void *addr, uint64_t flags)
1147 {
1148  M0_PRE(buf != NULL);
1149  M0_PRE(addr != NULL);
1150 
1151  data_buf_bob_init(buf);
1152  buf->db_flags = flags;
1153  m0_buf_init(&buf->db_buf, addr, PAGE_SIZE);
1154  buf->db_tioreq = NULL;
1155 }
1156 
1157 static void data_buf_fini(struct data_buf *buf)
1158 {
1159  M0_PRE(buf != NULL);
1160 
1161  data_buf_bob_fini(buf);
1162  buf->db_flags = PA_NONE;
1163 }
1164 
1165 static bool io_req_fop_invariant(const struct io_req_fop *fop)
1166 {
1167  return
1168  _0C(io_req_fop_bob_check(fop)) &&
1169  _0C(fop->irf_tioreq != NULL) &&
1170  _0C(fop->irf_ast.sa_cb != NULL) &&
1171  _0C(fop->irf_ast.sa_mach != NULL);
1172 }
1173 
1174 static bool target_ioreq_invariant(struct target_ioreq *ti)
1175 {
1176  return
1177  _0C(target_ioreq_bob_check(ti)) &&
1178  _0C(ti->ti_session != NULL) &&
1179  _0C(ti->ti_nwxfer != NULL) &&
1180  _0C(m0_fid_is_valid(&ti->ti_fid)) &&
1181  m0_tl_forall(iofops, iofop, &ti->ti_iofops,
1182  io_req_fop_invariant(iofop));
1183 }
1184 
1186 {
1187  return
1188  pargrp_iomap_bob_check(map) &&
1189  map->pi_ops != NULL &&
1190  map->pi_rtype < PIR_NR &&
1191  map->pi_databufs != NULL &&
1192  map->pi_ioreq != NULL &&
1193  ergo(indexvec_varr_count(&map->pi_ivv) > 0 &&
1194  V_SEG_NR(&map->pi_ivv) >= 2,
1195  m0_forall(i, V_SEG_NR(&map->pi_ivv) - 1,
1196  v_seg_endpos(&map->pi_ivv, i) <=
1197  V_INDEX(&map->pi_ivv, i+1))) &&
1199 }
1200 
1202 {
1203  return m0_forall(i, req->ir_iomap_nr,
1205 }
1206 
1207 static void nw_xfer_request_init(struct nw_xfer_request *xfer)
1208 {
1209  struct io_request *req;
1210  struct m0_pdclust_layout *play;
1211 
1212  M0_ENTRY("nw_xfer_request : %p", xfer);
1213  M0_PRE(xfer != NULL);
1214 
1215  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
1216  nw_xfer_request_bob_init(xfer);
1217  xfer->nxr_rc = 0;
1218  xfer->nxr_bytes = 0;
1219  m0_atomic64_set(&xfer->nxr_ccfop_nr, 0);
1220  m0_atomic64_set(&xfer->nxr_iofop_nr, 0);
1221  m0_atomic64_set(&xfer->nxr_rdbulk_nr, 0);
1222  xfer->nxr_state = NXS_INITIALIZED;
1223  xfer->nxr_ops = &xfer_ops;
1224  m0_mutex_init(&xfer->nxr_lock);
1225 
1226  play = pdlayout_get(req);
1227  xfer->nxr_rc = tioreqht_htable_init(&xfer->nxr_tioreqs_hash,
1228  layout_n(play) + 2 * layout_k(play));
1229 
1231  M0_LEAVE();
1232 }
1233 
1234 static void nw_xfer_request_fini(struct nw_xfer_request *xfer)
1235 {
1236  M0_PRE(xfer != NULL && xfer->nxr_state == NXS_COMPLETE);
1238  M0_ENTRY("nw_xfer_request : %p, nxr_rc %d", xfer, xfer->nxr_rc);
1239 
1240  xfer->nxr_ops = NULL;
1241  m0_mutex_fini(&xfer->nxr_lock);
1242  nw_xfer_request_bob_fini(xfer);
1243  tioreqht_htable_fini(&xfer->nxr_tioreqs_hash);
1244  M0_LEAVE();
1245 }
1246 
1247 M0_INTERNAL int user_page_map(struct data_buf *dbuf, unsigned long user_addr)
1248 {
1249  void *kmapped;
1250  int rc;
1251 
1252  M0_ASSERT_INFO((user_addr & ~PAGE_MASK) == 0,
1253  "user_addr = %lx", user_addr);
1254  M0_ASSERT_INFO(dbuf->db_page == NULL,
1255  "dbuf->db_page = %p", dbuf->db_page);
1256 
1257  /* XXX these calls can block */
1258  /* XXX
1259  * semaphore locking copy-pasted
1260  * from m0_net implementation
1261  */
1262  /*
1263  * XXX use PAGE_SIZE and
1264  * pin more than one page if needed
1265  */
1266  down_read(&current->mm->mmap_sem);
1267 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,9,0)
1268  rc = get_user_pages(user_addr, 1, FOLL_WRITE,
1269  &dbuf->db_page, NULL);
1270 #else
1271  rc = get_user_pages(current, current->mm, user_addr, 1, 1, 0,
1272  &dbuf->db_page, NULL);
1273 #endif
1274  up_read(&current->mm->mmap_sem);
1275  if (rc == 1) {
1276  kmapped = kmap(dbuf->db_page);
1277  rc = kmapped == NULL ? -EFAULT : 0;
1278  if (kmapped != NULL)
1279  data_buf_init(dbuf, kmapped, 0);
1280  }
1281  return M0_RC(rc);
1282 }
1283 
1284 static void user_page_unmap(struct data_buf *dbuf, bool set_dirty)
1285 {
1286  M0_ASSERT(dbuf->db_page != NULL);
1287  kunmap(dbuf->db_page);
1288  if (set_dirty)
1289  set_page_dirty(dbuf->db_page);
1290  put_page(dbuf->db_page);
1291  dbuf->db_page = NULL;
1292 }
1293 
1294 static int user_data_copy(struct pargrp_iomap *map,
1296  m0_bindex_t end,
1297  struct iov_iter *it,
1298  enum copy_direction dir,
1299  enum page_attr filter)
1300 {
1301  /*
1302  * iov_iter should be able to be used with copy_to_user() as well
1303  * since it is as good as a vector cursor.
1304  * Present kernel 2.6.32 has no support for such requirement.
1305  */
1306  uint64_t bytes;
1307  uint32_t row;
1308  uint32_t col;
1309  struct page *page;
1310  struct data_buf *dbuf;
1311 
1312  M0_ENTRY("Copy %s user-space, start = %8llu, end = %8llu",
1313  dir == CD_COPY_FROM_USER ? (char *)"from" : (char *)" to ",
1314  start, end);
1316  M0_PRE(it != NULL);
1318  M0_PRE(start >> PAGE_SHIFT == (end - 1) >> PAGE_SHIFT);
1319 
1320  /* Finds out the page from pargrp_iomap::pi_databufs. */
1321  page_pos_get(map, start, &row, &col);
1322  dbuf = map->pi_databufs[row][col];
1323  M0_ASSERT(dbuf != NULL);
1324  M0_ASSERT(ergo(dbuf->db_page != NULL, map->pi_ioreq->ir_direct_io));
1325 
1326  if (dir == CD_COPY_FROM_USER) {
1327  if ((dbuf->db_flags & filter) == filter) {
1328  if (dbuf->db_flags & PA_COPY_FRMUSR_DONE)
1329  return M0_RC(0);
1330 
1331  /*
1332  * Copies page to auxiliary buffer before it gets
1333  * overwritten by user data. This is needed in order
1334  * to calculate delta parity in case of read-old
1335  * approach.
1336  */
1337  if (dbuf->db_auxbuf.b_addr != NULL &&
1338  map->pi_rtype == PIR_READOLD) {
1339  if (filter == 0) {
1340  M0_ASSERT(dbuf->db_page == NULL);
1341  memcpy(dbuf->db_auxbuf.b_addr,
1342  dbuf->db_buf.b_addr, PAGE_SIZE);
1343  } else
1344  return M0_RC(0);
1345  }
1346 
1347  if (dbuf->db_page == NULL) {
1348  page = virt_to_page(dbuf->db_buf.b_addr);
1349  /* Copies to appropriate offset within page. */
1350 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
1351  bytes = iov_iter_copy_from_user_atomic(page, it,
1352  start & ~PAGE_MASK,
1353  end - start);
1354 #else
1355  bytes = iov_iter_copy_from_user(page, it,
1356  start & ~PAGE_MASK,
1357  end - start);
1358 #endif
1359 
1360  } else
1361  bytes = end - start;
1362 
1363  M0_LOG(M0_DEBUG, "[%p] %llu bytes copied from "
1364  "user-space from offset %llu", map->pi_ioreq,
1365  bytes, start);
1366 
1367  map->pi_ioreq->ir_copied_nr += bytes;
1368  /*
1369  * user_data_copy() may be called to handle only part
1370  * of PA_FULLPAGE_MODIFY page. In this case we should
1371  * mark the page as done only when the last piece is
1372  * processed. Otherwise, the rest piece of the page
1373  * will be ignored.
1374  */
1375  if (ergo(dbuf->db_flags & PA_FULLPAGE_MODIFY,
1376  (end & ~PAGE_MASK) == 0))
1377  dbuf->db_flags |= PA_COPY_FRMUSR_DONE;
1378 
1379  if (bytes != end - start)
1380  return M0_ERR_INFO(
1381  -EFAULT, "[%p] Failed to"
1382  " copy_from_user: %" PRIu64 " !="
1383  " %" PRIu64 " - %" PRIu64,
1384  map->pi_ioreq, bytes, end, start);
1385  }
1386  } else {
1387  if (dbuf->db_page == NULL)
1388  bytes = copy_to_user(it->iov->iov_base + it->iov_offset,
1389  (char *)dbuf->db_buf.b_addr +
1390  (start & ~PAGE_MASK),
1391  end - start);
1392  else
1393  bytes = 0;
1394 
1395  map->pi_ioreq->ir_copied_nr += end - start - bytes;
1396 
1397  M0_LOG(M0_DEBUG, "[%p] %llu bytes copied to user-space from "
1398  "offset %llu", map->pi_ioreq, end - start - bytes,
1399  start);
1400 
1401  if (bytes != 0)
1402  return M0_ERR_INFO(-EFAULT, "[%p] Failed to "
1403  "copy_to_user", map->pi_ioreq);
1404  }
1405 
1406  return M0_RC(0);
1407 }
1408 
1410 {
1411  int rc;
1412  uint32_t row;
1413  uint32_t col;
1414  struct m0_buf *dbufs;
1415  struct m0_buf *pbufs;
1416  struct m0_buf *old_pbuf;
1417  struct m0_pdclust_layout *play;
1418  struct inode *inode;
1419  struct m0t1fs_sb *csb;
1420  struct page *page;
1421  unsigned long zpage;
1422 
1423  M0_ENTRY("[%p] map = %p", map->pi_ioreq, map);
1425 
1427  csb = M0T1FS_SB(inode->i_sb);
1428  if (!(map->pi_ioreq->ir_type == IRT_READ && csb->csb_verify))
1429  return M0_RC(0);
1430 
1431  play = pdlayout_get(map->pi_ioreq);
1432  M0_ALLOC_ARR(dbufs, layout_n(play));
1433  M0_ALLOC_ARR(pbufs, layout_k(play));
1434  zpage = get_zeroed_page(GFP_KERNEL);
1435 
1436  if (dbufs == NULL || pbufs == NULL || zpage == 0) {
1437  rc = M0_ERR(-ENOMEM);
1438  goto last;
1439  }
1440 
1441  /* temprary buf to hold parity */
1442  for (col = 0; col < layout_k(play); ++col) {
1443  page = alloc_pages(GFP_KERNEL, 0);
1444  if (page == NULL) {
1445  rc = M0_ERR(-ENOMEM);
1446  goto last;
1447  }
1448 
1449  pbufs[col].b_addr = (void *)page_address(page);
1450  pbufs[col].b_nob = PAGE_SIZE;
1451  }
1452 
1453  for (row = 0; row < rows_nr(play); ++row) {
1454  /* data */
1455  for (col = 0; col < layout_n(play); ++col) {
1456  if (map->pi_databufs[row][col] != NULL) {
1457  dbufs[col] =
1458  map->pi_databufs[row][col]->db_buf;
1459  } else {
1460  dbufs[col].b_addr = (void *)zpage;
1461  dbufs[col].b_nob = PAGE_SIZE;
1462  }
1463  }
1464  /* generate parity into new buf */
1466  dbufs, pbufs);
1467 
1468  /* verify the parity */
1469  for (col = 0; col < layout_k(play); ++col) {
1470  old_pbuf = &map->pi_paritybufs[row][col]->db_buf;
1471  if (memcmp(pbufs[col].b_addr, old_pbuf->b_addr,
1472  PAGE_SIZE)) {
1473  M0_LOG(M0_ERROR, "[%p] parity verification "
1474  "failed for %llu [%u:%u], rc %d",
1475  map->pi_ioreq, map->pi_grpid, row, col,
1476  -EIO);
1477  rc = M0_ERR(-EIO);
1478  goto last;
1479  }
1480  M0_LOG(M0_DEBUG, "[%p] parity verified for %llu "
1481  "[%u:%u]", map->pi_ioreq, map->pi_grpid,
1482  row, col);
1483  }
1484  }
1485 
1486  rc = 0;
1487 last:
1488  if (pbufs != NULL) {
1489  for (col = 0; col < layout_k(play); ++col) {
1490  /* free_page(NULL) is OK */
1491  free_page((unsigned long)pbufs[col].b_addr);
1492  }
1493  }
1494  m0_free(dbufs);
1495  m0_free(pbufs);
1496  free_page(zpage);
1497  M0_LOG(M0_DEBUG, "[%p] parity verified for %llu, rc=%d", map->pi_ioreq,
1498  map->pi_grpid, rc);
1499  return M0_RC(rc);
1500 }
1501 
1503 {
1504  int rc = 0;
1505  uint32_t row;
1506  uint32_t col;
1507  struct m0_buf *dbufs;
1508  struct m0_buf *pbufs;
1509  struct m0_pdclust_layout *play;
1510 
1512 
1513  M0_ENTRY("[%p] map = %p", map->pi_ioreq, map);
1514 
1515  play = pdlayout_get(map->pi_ioreq);
1516  M0_ALLOC_ARR(dbufs, layout_n(play));
1517  M0_ALLOC_ARR(pbufs, layout_k(play));
1518 
1519  if (dbufs == NULL || pbufs == NULL) {
1520  rc = M0_ERR(-ENOMEM);
1521  goto last;
1522  }
1523 
1524  if ((map->pi_ioreq->ir_type == IRT_WRITE && map->pi_rtype == PIR_NONE)
1525  || map->pi_rtype == PIR_READREST) {
1526 
1527  unsigned long zpage;
1528 
1529  zpage = get_zeroed_page(GFP_KERNEL);
1530  if (zpage == 0) {
1531  rc = M0_ERR(-ENOMEM);
1532  goto last;
1533  }
1534 
1535  for (row = 0; row < rows_nr(play); ++row) {
1536  for (col = 0; col < layout_n(play); ++col)
1537  if (map->pi_databufs[row][col] != NULL) {
1538  dbufs[col] = map->pi_databufs
1539  [row][col]->db_buf;
1540  } else {
1541  dbufs[col].b_addr = (void *)zpage;
1542  dbufs[col].b_nob = PAGE_SIZE;
1543  }
1544 
1545  for (col = 0; col < layout_k(play); ++col)
1546  pbufs[col] = map->pi_paritybufs[row][col]->
1547  db_buf;
1548 
1550  dbufs, pbufs);
1551  }
1552  free_page(zpage);
1553  M0_LOG(M0_DEBUG, "[%p] Parity recalculated for %s",
1554  map->pi_ioreq,
1555  map->pi_rtype == PIR_READREST ? "read-rest" :
1556  "aligned write");
1557 
1558  } else {
1559  struct m0_buf *old;
1560 
1561  M0_ALLOC_ARR(old, layout_n(play));
1562  if (old == NULL) {
1563  rc = M0_ERR(-ENOMEM);
1564  goto last;
1565  }
1566 
1567  for (row = 0; row < rows_nr(play); ++row) {
1568  for (col = 0; col < layout_k(play); ++col)
1569  pbufs[col] = map->pi_paritybufs[row][col]->
1570  db_buf;
1571 
1572  for (col = 0; col < layout_n(play); ++col) {
1573  /*
1574  * During rmw-IO request with read-old approach
1575  * we allocate primary and auxiliary buffers
1576  * for those units from a parity group, that
1577  * are spanned by input rmw-IO request. If
1578  * these units belong to failed devices then
1579  * during the degraded reading, primary buffers
1580  * are allocated for rest of the units from the
1581  * parity group in order to recover the failed
1582  * units. Thus if a parity group is in dgmode,
1583  * then every unit will have a primary buffer,
1584  * but may not have an auxiliary buffer.
1585  */
1586  if (map->pi_databufs[row][col] == NULL ||
1587  map->pi_databufs[row][col]->
1588  db_auxbuf.b_addr == NULL)
1589  continue;
1590 
1591  dbufs[col] = map->pi_databufs[row][col]->db_buf;
1592  old[col] = map->pi_databufs[row][col]->
1593  db_auxbuf;
1594 
1595  rc = m0_parity_math_diff(parity_math(map->pi_ioreq),
1596  old, dbufs, pbufs, col);
1597  if (rc != 0) {
1598  m0_free(old);
1599  goto last;
1600  }
1601  }
1602  }
1603  m0_free(old);
1604  }
1605 last:
1606  m0_free(dbufs);
1607  m0_free(pbufs);
1608  return M0_RC(rc);
1609 }
1610 
1612 {
1613  int rc = 0;
1614  uint64_t i;
1615  struct pargrp_iomap *iomap;
1616  struct inode *inode;
1617  struct m0t1fs_sb *csb;
1618 
1619  M0_ENTRY("[%p]", req);
1621 
1623  csb = M0T1FS_SB(inode->i_sb);
1624 
1625  if (!(req->ir_type == IRT_READ && csb->csb_verify))
1626  return M0_RC(0);
1627 
1629 
1630  for (i = 0; i < req->ir_iomap_nr; ++i) {
1631  iomap = req->ir_iomaps[i];
1632  if (iomap->pi_state == PI_DEGRADED) {
1633  /* data is recovered from existing data and parity.
1634  * It's meaningless to do parity verification */
1635  continue;
1636  }
1637  rc = iomap->pi_ops->pi_parity_verify(iomap);
1638  if (rc != 0)
1639  break;
1640  }
1641 
1643 
1644  return rc != 0 ? M0_ERR_INFO(rc, "[%p] Parity verification failed for "
1645  "grpid=%llu", req,
1646  iomap->pi_grpid) : M0_RC(rc);
1647 }
1648 
1650 {
1651  int rc = 0;
1652  uint64_t i;
1653  struct pargrp_iomap *iomap;
1654 
1655  M0_ENTRY("[%p]", req);
1657 
1659 
1660  for (i = 0; i < req->ir_iomap_nr; ++i) {
1661  iomap = req->ir_iomaps[i];
1662  rc = iomap->pi_ops->pi_parity_recalc(iomap);
1663  if (rc != 0)
1664  break;
1665  }
1666 
1668 
1669  return rc == 0 ? M0_RC(rc) :
1670  M0_ERR_INFO(rc, "Parity recalc failed for grpid=%3"PRIu64,
1671  iomap->pi_grpid);
1672 }
1673 
1674 /* Finds out pargrp_iomap from array of such structures in io_request. */
1675 static void ioreq_pgiomap_find(struct io_request *req,
1676  uint64_t grpid,
1677  uint64_t *cursor,
1678  struct pargrp_iomap **out)
1679 {
1680  uint64_t i;
1681 
1682  M0_PRE(req != NULL);
1683  M0_PRE(out != NULL);
1684  M0_PRE(cursor != NULL);
1685  M0_PRE(*cursor < req->ir_iomap_nr);
1686  M0_ENTRY("[%p] group_id=%llu cursor=%llu", req, grpid, *cursor);
1687 
1688  for (i = *cursor; i < req->ir_iomap_nr; ++i) {
1689  if (req->ir_iomaps[i]->pi_grpid == grpid) {
1690  *out = req->ir_iomaps[i];
1691  *cursor = i;
1692  break;
1693  }
1694  }
1695 
1696  M0_POST(i < req->ir_iomap_nr);
1697  M0_LEAVE("[%p] result iomap=%llu", req, i);
1698 }
1699 
1701  enum copy_direction dir,
1702  enum page_attr filter)
1703 {
1704  int rc;
1705  uint64_t i;
1706  m0_bindex_t grpstart;
1707  m0_bindex_t grpend;
1708  m0_bindex_t pgstart;
1709  m0_bindex_t pgend;
1711  struct iov_iter it;
1712  struct m0_ivec_varr_cursor srccur;
1713  struct m0_pdclust_layout *play;
1714  struct pargrp_iomap *iomap;
1715 
1716  M0_ENTRY("[%p] %s user-space. filter = 0x%x",
1717  req, dir == CD_COPY_FROM_USER ? (char *)"from" : (char *)"to",
1718  filter);
1720  M0_PRE(dir < CD_NR);
1721 
1722 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
1723  iov_iter_init(&it, WRITE, req->ir_iovec, V_SEG_NR(&req->ir_ivv),
1725 #else
1726  iov_iter_init(&it, req->ir_iovec, V_SEG_NR(&req->ir_ivv),
1728 #endif
1729  m0_ivec_varr_cursor_init(&srccur, &req->ir_ivv);
1730  play = pdlayout_get(req);
1731 
1732  for (i = 0; i < req->ir_iomap_nr; ++i) {
1733  iomap = req->ir_iomaps[i];
1735 
1736  count = 0;
1737  grpstart = data_size(play) * iomap->pi_grpid;
1738  grpend = grpstart + data_size(play);
1739 
1740  while (!m0_ivec_varr_cursor_move(&srccur, count) &&
1741  m0_ivec_varr_cursor_index(&srccur) < grpend) {
1742 
1743  pgstart = m0_ivec_varr_cursor_index(&srccur);
1744  pgend = min64u(m0_round_up(pgstart + 1, PAGE_SIZE),
1745  pgstart + m0_ivec_varr_cursor_step(&srccur));
1746  count = pgend - pgstart;
1747 
1748  /*
1749  * This takes care of finding correct page from
1750  * current pargrp_iomap structure from pgstart
1751  * and pgend.
1752  */
1753  rc = user_data_copy(iomap, pgstart, pgend,
1754  &it, dir, filter);
1755  if (rc != 0)
1756  return M0_ERR_INFO(
1757  rc, "[%p] Copy failed (pgstart=%" PRIu64
1758  " pgend=%" PRIu64 ")",
1759  req, pgstart, pgend);
1760 
1762  }
1763  }
1764 
1765  return M0_RC(0);
1766 }
1767 
1768 static void indexvec_sort(struct m0_indexvec_varr *ivec)
1769 {
1770  uint32_t i;
1771  uint32_t j;
1772 
1773  M0_ENTRY("indexvec = %p", ivec);
1774  M0_PRE(ivec != NULL && indexvec_varr_count(ivec) != 0);
1775 
1776  /*
1777  * TODO Should be replaced by an efficient sorting algorithm,
1778  * something like heapsort which is fairly inexpensive in kernel
1779  * mode with the least worst case scenario.
1780  * Existing heap sort from kernel code can not be used due to
1781  * apparent disconnect between index vector and its associated
1782  * count vector for same index.
1783  */
1784  for (i = 0; i < V_SEG_NR(ivec); ++i) {
1785  for (j = i+1; j < V_SEG_NR(ivec); ++j) {
1786  if (V_INDEX(ivec, i) > V_INDEX(ivec, j)) {
1787  M0_SWAP(V_INDEX(ivec, i), V_INDEX(ivec, j));
1788  M0_SWAP(V_COUNT(ivec, i), V_COUNT(ivec, j));
1789  }
1790  }
1791  }
1792  M0_LEAVE();
1793 }
1794 
1796  struct io_request *req,
1797  uint64_t grpid)
1798 {
1799  int rc;
1800  int row;
1801  struct m0_pdclust_layout *play;
1802  struct inode *inode;
1803  struct m0t1fs_sb *csb;
1804 
1805  M0_ENTRY("[%p] map = %p, grpid = %llu", req, map, grpid);
1806  M0_PRE(map != NULL);
1807  M0_PRE(req != NULL);
1808 
1809  pargrp_iomap_bob_init(map);
1810  map->pi_ops = &iomap_ops;
1811  map->pi_rtype = PIR_NONE;
1812  map->pi_grpid = grpid;
1813  map->pi_ioreq = req;
1814  map->pi_state = PI_HEALTHY;
1815  map->pi_paritybufs = NULL;
1816 
1818  csb = M0T1FS_SB(inode->i_sb);
1819 
1820  play = pdlayout_get(req);
1821  rc = m0_indexvec_varr_alloc(&map->pi_ivv, page_nr(data_size(play)));
1822  if (rc != 0)
1823  goto fail_iv;
1824 
1825  /*
1826  * This number is incremented only when a valid segment
1827  * is added to the index vector.
1828  */
1829  V_SEG_NR(&map->pi_ivv) = 0;
1830 
1831  M0_ALLOC_ARR(map->pi_databufs, rows_nr(play));
1832  if (map->pi_databufs == NULL)
1833  goto fail;
1834 
1835  for (row = 0; row < rows_nr(play); ++row) {
1836  M0_ALLOC_ARR(map->pi_databufs[row], layout_n(play));
1837  if (map->pi_databufs[row] == NULL)
1838  goto fail;
1839  }
1840 
1841  if (req->ir_type == IRT_WRITE ||
1842  (req->ir_type == IRT_READ && csb->csb_verify)) {
1843  M0_ALLOC_ARR(map->pi_paritybufs, rows_nr(play));
1844  if (map->pi_paritybufs == NULL)
1845  goto fail;
1846 
1847  for (row = 0; row < rows_nr(play); ++row) {
1848  M0_ALLOC_ARR(map->pi_paritybufs[row],
1849  layout_k(play));
1850  if (map->pi_paritybufs[row] == NULL)
1851  goto fail;
1852  }
1853  }
1854 
1855  M0_LOG(M0_DEBUG, "[%p] grpid=%llu, ivec has %llu segs, "
1856  "databufs=[%u x %u] paritybufs=[%u x %u]",
1857  req, grpid, page_nr(data_size(play)),
1858  rows_nr(play), layout_n(play),
1859  rows_nr(play), layout_k(play));
1860 
1862  return M0_RC(0);
1863 
1864 fail:
1865  m0_indexvec_varr_free(&map->pi_ivv);
1866 
1867  if (map->pi_databufs != NULL) {
1868  for (row = 0; row < rows_nr(play); ++row)
1869  m0_free(map->pi_databufs[row]);
1870  m0_free(map->pi_databufs);
1871  }
1872  if (map->pi_paritybufs != NULL) {
1873  for (row = 0; row < rows_nr(play); ++row)
1874  m0_free(map->pi_paritybufs[row]);
1875  m0_free(map->pi_paritybufs);
1876  }
1877 fail_iv:
1878  return M0_ERR_INFO(-ENOMEM, "[%p] Memory allocation failed", req);
1879 }
1880 
1881 static void pargrp_iomap_fini(struct pargrp_iomap *map)
1882 {
1883  uint32_t row;
1884  uint32_t col;
1885  struct m0_pdclust_layout *play;
1886 
1887  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
1889 
1890  play = pdlayout_get(map->pi_ioreq);
1891  map->pi_ops = NULL;
1892  map->pi_rtype = PIR_NONE;
1893  map->pi_state = PI_NONE;
1894 
1895  pargrp_iomap_bob_fini(map);
1896  m0_indexvec_varr_free(&map->pi_ivv);
1897 
1898  for (row = 0; row < rows_nr(play); ++row) {
1899  for (col = 0; col < layout_n(play); ++col) {
1900  if (map->pi_databufs[row][col] != NULL) {
1902  pi_databufs[row][col]);
1903  map->pi_databufs[row][col] = NULL;
1904  }
1905  }
1906  m0_free0(&map->pi_databufs[row]);
1907  }
1908 
1909  if (map->pi_paritybufs != NULL) {
1910  for (row = 0; row < rows_nr(play); ++row) {
1911  for (col = 0; col < layout_k(play); ++col) {
1912  if (map->pi_paritybufs[row][col] != NULL) {
1914  pi_paritybufs[row][col]);
1915  map->pi_paritybufs[row][col] = NULL;
1916  }
1917  }
1918  m0_free0(&map->pi_paritybufs[row]);
1919  }
1920  }
1921 
1922  m0_free0(&map->pi_databufs);
1923  m0_free0(&map->pi_paritybufs);
1924  map->pi_ioreq = NULL;
1925  M0_LEAVE();
1926 }
1927 
1933 {
1934  uint32_t seg;
1935  bool spanned = false;
1936 
1937  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
1938 
1940 
1941  for (seg = 0; seg < V_SEG_NR(&map->pi_ivv); ++seg) {
1942  if (V_INDEX(&map->pi_ivv, seg) <= index &&
1943  index + count <= v_seg_endpos(&map->pi_ivv, seg)) {
1944  spanned = true;
1945  break;
1946  }
1947  }
1948  return M0_RC(!!spanned);
1949 }
1950 
1952  uint32_t row,
1953  uint32_t col)
1954 {
1955  M0_PRE(map != NULL);
1956  M0_PRE(map->pi_databufs[row][col] == NULL);
1957 
1958  M0_ENTRY("[%p] map %p, row %u col %u", map->pi_ioreq, map, row, col);
1959  map->pi_databufs[row][col] = data_buf_alloc_init(0);
1960 
1961  return map->pi_databufs[row][col] == NULL ? M0_ERR(-ENOMEM) : 0;
1962 }
1963 
1964 /* Allocates data_buf structures as needed and populates the buffer flags. */
1966  uint64_t seg,
1967  bool rmw)
1968 {
1969  int rc;
1970  int flags;
1971  bool ret;
1972  uint32_t row;
1973  uint32_t col;
1974  uint64_t count = 0;
1976  m0_bindex_t end;
1977  struct inode *inode;
1978  struct m0_ivec_varr_cursor cur;
1979  struct m0_pdclust_layout *play;
1980  struct io_request *req = map->pi_ioreq;
1981 
1982  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
1983  M0_LOG(M0_DEBUG, "[%p] pgid %llu seg %llu = [%llu, +%llu), %s",
1984  map->pi_ioreq, map->pi_grpid, seg,
1985  V_INDEX(&map->pi_ivv, seg),
1986  V_COUNT(&map->pi_ivv, seg),
1987  rmw ? "rmw" : "aligned");
1988  play = pdlayout_get(req);
1990  m0_ivec_varr_cursor_init(&cur, &map->pi_ivv);
1991  ret = m0_ivec_varr_cursor_move_to(&cur, V_INDEX(&map->pi_ivv, seg));
1992  M0_ASSERT(!ret);
1993 
1994  /* process a page at each iteration */
1995  while (!m0_ivec_varr_cursor_move(&cur, count)) {
1997  end = min64u(m0_round_up(start + 1, PAGE_SIZE),
1999  count = end - start;
2000 
2001  flags = 0;
2002  if (req->ir_type == IRT_WRITE) {
2003  flags |= PA_WRITE;
2004  flags |= count == PAGE_SIZE ?
2006 
2007  /*
2008  * Even if PA_PARTPAGE_MODIFY flag is set in
2009  * this buffer, the auxiliary buffer can not be
2010  * allocated until ::pi_rtype is selected.
2011  */
2012  if (rmw && (flags & PA_PARTPAGE_MODIFY) &&
2013  (end < inode->i_size ||
2014  (inode->i_size > 0 &&
2015  page_id(end - 1) == page_id(inode->i_size - 1))))
2016  flags |= PA_READ;
2017  } else {
2018  /*
2019  * For read IO requests, file_aio_read() has already
2020  * delimited the index vector to EOF boundary.
2021  */
2022  flags |= PA_READ;
2023  }
2024 
2025  page_pos_get(map, start, &row, &col);
2026  rc = pargrp_iomap_databuf_alloc(map, row, col);
2027  M0_LOG(M0_DEBUG, "[%p] alloc start %8llu count %4llu pgid "
2028  "%3llu row %u col %u f 0x%x addr %p",
2029  req, start, count, map->pi_grpid, row, col, flags,
2030  map->pi_databufs[row][col] != NULL ?
2031  map->pi_databufs[row][col]->db_buf.b_addr : NULL);
2032  if (rc != 0)
2033  goto err;
2034  map->pi_databufs[row][col]->db_flags = flags;
2035  }
2036 
2037  return M0_RC(0);
2038 err:
2039  for (row = 0; row < rows_nr(play); ++row) {
2040  for (col = 0; col < layout_n(play); ++col) {
2041  if (map->pi_databufs[row][col] != NULL) {
2042  data_buf_dealloc_fini(map->pi_databufs
2043  [row][col]);
2044  map->pi_databufs[row][col] = NULL;
2045  }
2046  }
2047  }
2048  return M0_ERR_INFO(rc, "[%p] databuf_alloc failed", req);
2049 }
2050 
2052 {
2053  uint32_t row;
2054  uint32_t col;
2055  uint64_t nr = 0;
2056  struct m0_pdclust_layout *play;
2057 
2059 
2060  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
2061  play = pdlayout_get(map->pi_ioreq);
2062 
2063  for (row = 0; row < rows_nr(play); ++row) {
2064  for (col = 0; col < layout_n(play); ++col) {
2065 
2066  if (map->pi_databufs[row][col] &&
2067  map->pi_databufs[row][col]->db_flags &
2069  ++nr;
2070  }
2071  }
2072  M0_LEAVE();
2073  return nr;
2074 }
2075 
2077  uint32_t row,
2078  uint32_t col)
2079 {
2081  M0_PRE(map->pi_rtype == PIR_READOLD);
2082 
2083  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
2084  map->pi_databufs[row][col]->db_auxbuf.b_addr = (void *)
2085  get_zeroed_page(GFP_KERNEL);
2086 
2087  if (map->pi_databufs[row][col]->db_auxbuf.b_addr == NULL)
2088  return M0_ERR(-ENOMEM);
2089  ++iommstats.a_page_nr;
2090  map->pi_databufs[row][col]->db_auxbuf.b_nob = PAGE_SIZE;
2091 
2092  return M0_RC(0);
2093 }
2094 
2095 /*
2096  * Allocates auxiliary buffer for data_buf structures in
2097  * pargrp_iomap structure.
2098  */
2100 {
2101  int rc = 0;
2102  uint64_t start;
2103  uint64_t end;
2104  uint64_t count = 0;
2105  uint32_t row;
2106  uint32_t col;
2107  struct inode *inode;
2108  struct m0_ivec_varr_cursor cur;
2109 
2111  M0_PRE(map->pi_rtype == PIR_READOLD);
2112 
2113  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
2114  inode = m0t1fs_file_to_inode(map->pi_ioreq->ir_file);
2115  m0_ivec_varr_cursor_init(&cur, &map->pi_ivv);
2116 
2117  while (!m0_ivec_varr_cursor_move(&cur, count)) {
2119  end = min64u(m0_round_up(start + 1, PAGE_SIZE),
2121  count = end - start;
2122  page_pos_get(map, start, &row, &col);
2123 
2124  if (map->pi_databufs[row][col] != NULL) {
2125  /*
2126  * In Readold approach, all valid pages have to
2127  * be read regardless of whether they are fully
2128  * occupied or partially occupied.
2129  * This is needed in order to calculate correct
2130  * parity in differential manner.
2131  * Also, read flag should be set only for pages
2132  * which lie within end-of-file boundary.
2133  */
2134  if (end < inode->i_size ||
2135  (inode->i_size > 0 &&
2136  page_id(end - 1) == page_id(inode->i_size - 1)))
2137  map->pi_databufs[row][col]->db_flags |=
2138  PA_READ;
2139 
2140  rc = pargrp_iomap_auxbuf_alloc(map, row, col);
2141  if (rc != 0)
2142  return M0_ERR_INFO(rc, "[%p] auxbuf_alloc "
2143  "failed", map->pi_ioreq);
2144  }
2145  }
2146  return M0_RC(rc);
2147 }
2148 
2149 /*
2150  * A read request from rmw IO request can lead to either
2151  *
2152  * read_old - Read the old data for the extent spanned by current
2153  * IO request, along with the old parity unit. This approach needs
2154  * to calculate new parity in _iterative_ manner. This approach is
2155  * selected only if current IO extent lies within file size.
2156  *
2157  * read_rest - Read rest of the parity group, which is _not_ spanned
2158  * by current IO request, so that data for whole parity group can
2159  * be availble for parity calculation.
2160  * This approach reads the extent from start of parity group to the
2161  * point where a page is completely spanned by incoming IO request.
2162  *
2163  * Typically, the approach which leads to least size of data to be
2164  * read and written from server is selected.
2165  *
2166  * N = 5, P = 1, K = 1, unit_size = 4k
2167  * F => Fully occupied
2168  * P' => Partially occupied
2169  * # => Parity unit
2170  * * => Spare unit
2171  * x => Start of actual file extent.
2172  * y => End of actual file extent.
2173  * a => Rounded down value of x.
2174  * b => Rounded up value of y.
2175  *
2176  * Read-rest approach
2177  *
2178  * a x
2179  * +---+---+---+---+---+---+---+
2180  * | | P'| F | F | F | # | * | PG#0
2181  * +---+---+---+---+---+---+---+
2182  * | F | F | F | F | F | # | * | PG#1
2183  * +---+---+---+---+---+---+---+
2184  * | F | F | F | P'| | # | * | PG#2
2185  * +---+---+---+---+---+---+---+
2186  * N N N N N K P
2187  * y b
2188  *
2189  * Read-old approach
2190  *
2191  * a x
2192  * +---+---+---+---+---+---+---+
2193  * | | | | P'| F | # | * | PG#0
2194  * +---+---+---+---+---+---+---+
2195  * | F | F | F | F | F | # | * | PG#1
2196  * +---+---+---+---+---+---+---+
2197  * | F | P'| | | | # | * | PG#2
2198  * +---+---+---+---+---+---+---+
2199  * N N N N N K P
2200  * y b
2201  *
2202  */
2204 {
2205  int rc;
2206  uint32_t row;
2207  uint32_t col;
2208  uint32_t seg;
2209  uint32_t seg_nr;
2210  m0_bindex_t grpstart;
2211  m0_bindex_t grpend;
2213  m0_bindex_t end;
2214  m0_bcount_t count = 0;
2215  struct inode *inode;
2216  struct m0_indexvec_varr *ivec;
2217  struct m0_ivec_varr_cursor cur;
2218  struct m0_pdclust_layout *play;
2219 
2221  M0_PRE(map->pi_rtype == PIR_READREST);
2222 
2223  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
2224  play = pdlayout_get(map->pi_ioreq);
2225  ivec = &map->pi_ivv;
2226  seg_nr = V_SEG_NR(&map->pi_ivv);
2227  grpstart = data_size(play) * map->pi_grpid;
2228  grpend = grpstart + data_size(play);
2229 
2230  /* Extends first segment to align with start of parity group. */
2231  V_COUNT(ivec, 0) += (V_INDEX(ivec, 0) - grpstart);
2232  V_INDEX(ivec, 0) = grpstart;
2233 
2234  /* Extends last segment to align with end of parity group. */
2235  V_COUNT(ivec, seg_nr - 1) = grpend - V_INDEX(ivec, seg_nr - 1);
2236 
2237  /*
2238  * All io extents _not_ spanned by pargrp_iomap::pi_ivv
2239  * need to be included so that _all_ pages from parity group
2240  * are available to do IO.
2241  */
2242  for (seg = 1; seg_nr > 2 && seg <= seg_nr - 2; ++seg) {
2243  if (v_seg_endpos(ivec, seg) < V_INDEX(ivec, seg + 1))
2244  V_COUNT(ivec, seg) += V_INDEX(ivec, seg + 1) -
2245  v_seg_endpos(ivec, seg);
2246  }
2247 
2248  inode = m0t1fs_file_to_inode(map->pi_ioreq->ir_file);
2249  m0_ivec_varr_cursor_init(&cur, &map->pi_ivv);
2250 
2251  while (!m0_ivec_varr_cursor_move(&cur, count)) {
2252 
2254  end = min64u(m0_round_up(start + 1, PAGE_SIZE),
2256  count = end - start;
2257  page_pos_get(map, start, &row, &col);
2258 
2259  if (map->pi_databufs[row][col] == NULL) {
2260  rc = pargrp_iomap_databuf_alloc(map, row, col);
2261  if (rc != 0)
2262  return M0_ERR_INFO(rc, "[%p] databuf_alloc "
2263  "failed", map->pi_ioreq);
2264 
2265  if (end <= inode->i_size || (inode->i_size > 0 &&
2266  page_id(end - 1) == page_id(inode->i_size - 1)))
2267  map->pi_databufs[row][col]->db_flags |=
2268  PA_READ;
2269  }
2270  }
2271 
2272  return M0_RC(0);
2273 }
2274 
2276 {
2277  uint32_t row;
2278  uint32_t col;
2279  struct m0_pdclust_layout *play;
2280  struct inode *inode;
2281  struct m0t1fs_sb *csb;
2282  struct data_buf *dbuf;
2283 
2285 
2286  M0_ENTRY("[%p] map %p grpid=%llu", map->pi_ioreq, map, map->pi_grpid);
2288  csb = M0T1FS_SB(inode->i_sb);
2289 
2290  play = pdlayout_get(map->pi_ioreq);
2291  for (row = 0; row < rows_nr(play); ++row) {
2292  for (col = 0; col < layout_k(play); ++col) {
2293  struct file *irf;
2294 
2295  map->pi_paritybufs[row][col] = data_buf_alloc_init(0);
2296  if (map->pi_paritybufs[row][col] == NULL)
2297  goto err;
2298  dbuf = map->pi_paritybufs[row][col];
2299  if (map->pi_ioreq->ir_type == IRT_WRITE)
2300  dbuf->db_flags |= PA_WRITE;
2301 
2302  irf = map->pi_ioreq->ir_file;
2303  if ((map->pi_rtype == PIR_READOLD ||
2304  (map->pi_ioreq->ir_type == IRT_READ &&
2305  csb->csb_verify)) &&
2306  m0t1fs_file_to_inode(irf)->i_size >
2307  data_size(play) * map->pi_grpid)
2308  dbuf->db_flags |= PA_READ;
2309  }
2310  }
2311  return M0_RC(0);
2312 err:
2313  for (row = 0; row < rows_nr(play); ++row) {
2314  for (col = 0; col < layout_k(play); ++col)
2315  m0_free0(&map->pi_paritybufs[row][col]);
2316  }
2317  return M0_ERR_INFO(-ENOMEM, "[%p] Memory allocation failed for "
2318  "data_buf.", map->pi_ioreq);
2319 }
2320 
2329 static m0_bindex_t seg_set(struct pargrp_iomap *map, uint32_t seg,
2330  struct m0_ivec_varr_cursor *cur, m0_bindex_t grpend)
2331 {
2332  m0_bindex_t end = m0_ivec_varr_cursor_conti(cur, grpend);
2333 
2335  V_COUNT(&map->pi_ivv, seg) = end - V_INDEX(&map->pi_ivv, seg);
2336 
2337  return end;
2338 }
2339 
2341 static void seg_idx_inc_round(struct pargrp_iomap *map, uint32_t seg,
2342  uint64_t sz)
2343 {
2344  m0_bindex_t idx = m0_round_up(V_INDEX(&map->pi_ivv, seg) + 1, sz);
2345 
2346  V_COUNT(&map->pi_ivv, seg) -= idx - V_INDEX(&map->pi_ivv, seg);
2347  V_INDEX(&map->pi_ivv, seg) = idx;
2348 }
2349 
2351 static void seg_align(struct pargrp_iomap *map, uint32_t seg,
2352  m0_bindex_t end, uint64_t sz)
2353 {
2354  m0_bindex_t idx = round_down(V_INDEX(&map->pi_ivv, seg), sz);
2355 
2356  V_INDEX(&map->pi_ivv, seg) = idx;
2357  V_COUNT(&map->pi_ivv, seg) = round_up(end, sz) - idx;
2358 }
2359 
2365  struct m0_ivec_varr_cursor *cursor,
2366  bool rmw)
2367 {
2368  int rc;
2369  uint32_t seg;
2370  m0_bindex_t seg_end = 0;
2371  m0_bcount_t grpsize;
2372  m0_bcount_t count = 0;
2373  m0_bindex_t grpstart;
2374  m0_bindex_t grpend;
2375  struct m0_pdclust_layout *play;
2376  struct inode *inode;
2377 
2378  M0_PRE(map != NULL);
2379 
2380  play = pdlayout_get(map->pi_ioreq);
2381  grpsize = data_size(play);
2382  grpstart = grpsize * map->pi_grpid;
2383  grpend = grpstart + grpsize;
2385 
2386  for (seg = 0; !m0_ivec_varr_cursor_move(cursor, count) &&
2387  m0_ivec_varr_cursor_index(cursor) < grpend;) {
2388  /*
2389  * Skips the current segment if it is completely spanned by
2390  * rounding up/down of an earlier segment.
2391  */
2392  if (map->pi_ops->pi_spans_seg(map,
2393  m0_ivec_varr_cursor_index(cursor),
2394  m0_ivec_varr_cursor_step(cursor))) {
2395  count = m0_ivec_varr_cursor_step(cursor);
2396  continue;
2397  }
2398 
2399  /* Make sure read IO does not go beyond EOF. */
2400  if (map->pi_ioreq->ir_type == IRT_READ &&
2401  grpend > inode->i_size) {
2402  if (V_INDEX(&map->pi_ivv, seg) >= inode->i_size) {
2403  count = m0_ivec_varr_cursor_step(cursor);
2404  continue;
2405  }
2406  seg_end = seg_set(map, seg, cursor, inode->i_size);
2407  } else
2408  seg_end = seg_set(map, seg, cursor, grpend);
2409 
2410  /*
2411  * If current segment is _partially_ spanned by previous
2412  * segment in pargrp_iomap::pi_ivv, start of segment is
2413  * rounded up to move to next page.
2414  */
2415  if (seg > 0 && V_INDEX(&map->pi_ivv, seg) <
2416  v_seg_endpos(&map->pi_ivv, seg - 1))
2418 
2419  ++V_SEG_NR(&map->pi_ivv);
2420 
2421  M0_LOG(M0_DEBUG, "[%p] pre grp_id=%" PRIu64 " seg=%"PRIu32
2422  " =[%" PRIu64 ",+%" PRIu64 ")", map->pi_ioreq,
2423  map->pi_grpid,seg, V_INDEX(&map->pi_ivv, seg),
2424  V_COUNT(&map->pi_ivv, seg));
2425 
2426  rc = map->pi_ops->pi_seg_process(map, seg, rmw);
2427  if (rc != 0)
2428  return M0_ERR(rc);
2429 
2430  seg_align(map, seg, seg_end, PAGE_SIZE);
2431 
2432  M0_LOG(M0_DEBUG, "[%p] post grp_id=%" PRIu64 " seg=%"PRIu32
2433  " =[%" PRIu64 ",+%" PRIu64 ")", map->pi_ioreq,
2434  map->pi_grpid, seg, V_INDEX(&map->pi_ivv, seg),
2435  V_COUNT(&map->pi_ivv, seg));
2436 
2437  count = seg_end - m0_ivec_varr_cursor_index(cursor);
2438  M0_LOG(M0_DEBUG, "[%p] cursor advance +%" PRIu64 " from %"PRIu64,
2439  map->pi_ioreq, count, m0_ivec_varr_cursor_index(cursor));
2440  ++seg;
2441  }
2442 
2443  return M0_RC(0);
2444 }
2445 
2446 /*
2447  * Decides whether to undertake a read-old or read-rest approach for
2448  * the parity group RMW IO request based on the total number of pages
2449  * to be read and written.
2450  *
2451  * In read-old approach, the old data and parity units are read and
2452  * the new parity is calculated incrementally based on the difference
2453  * between old and new data and parity units.
2454  *
2455  * In read-rest approach, the rest data units of the group are read
2456  * and the new parity is calculated based on them and the new data
2457  * units to be written.
2458  *
2459  * In both approaches, the number of units to be written is the same
2460  * (new data units and udpated parity units), so we compare only the
2461  * number of units (pages) to be read.
2462  *
2463  * By default, the segments in index vector pargrp_iomap::pi_ivec
2464  * are suitable for read-old approach. Hence the index vector is
2465  * changed only if read-rest approach is selected.
2466  *
2467  * @param map is the parity group iomap
2468  * @param data_pages_nr is the number of data pages in group
2469  * @param parity_pages_nr is the number of parity pages in group
2470  */
2472  m0_bcount_t data_pages_nr,
2473  m0_bcount_t parity_pages_nr)
2474 {
2475  int rc;
2476  /*
2477  * In read-old the number of pages to be read is the same as
2478  * the number of pages to be written.
2479  *
2480  * TODO: Can use number of data_buf structures instead of using
2481  * indexvec_page_nr().
2482  */
2483  uint64_t ro_pages_nr = iomap_page_nr(map) + parity_pages_nr;
2484  /*
2485  * In read-rest the number of pages to be read is all data
2486  * pages which are not fully spanned by the io vector.
2487  */
2488  uint64_t rr_pages_nr = data_pages_nr -
2489  map->pi_ops->pi_fullpages_find(map);
2490 
2491  if (rr_pages_nr < ro_pages_nr) {
2492  M0_LOG(M0_DEBUG, "[%p] Read-rest selected", map->pi_ioreq);
2493  map->pi_rtype = PIR_READREST;
2494  rc = map->pi_ops->pi_readrest(map);
2495  if (rc != 0)
2496  return M0_ERR(rc);
2497  } else {
2498  M0_LOG(M0_DEBUG, "[%p] Read-old selected", map->pi_ioreq);
2499  map->pi_rtype = PIR_READOLD;
2500  rc = map->pi_ops->pi_readold_auxbuf_alloc(map);
2501  }
2502 
2503  return M0_RC(rc);
2504 }
2505 
2507  struct m0_ivec_varr_cursor *cursor)
2508 {
2509  int rc;
2510  bool rmw = false;
2511  uint64_t grpsize;
2512  m0_bcount_t count = 0;
2513  m0_bindex_t grpstart;
2514  m0_bindex_t grpend;
2515  struct m0_pdclust_layout *play;
2516  struct inode *inode;
2517  struct m0t1fs_sb *csb;
2518  struct io_request *req;
2519 
2520  M0_PRE(map != NULL);
2521  M0_PRE(cursor->vc_ivv != NULL);
2522 
2523  req = map->pi_ioreq;
2524  play = pdlayout_get(map->pi_ioreq);
2525  grpsize = data_size(play);
2526  grpstart = grpsize * map->pi_grpid;
2527  grpend = grpstart + grpsize;
2529  csb = M0T1FS_SB(inode->i_sb);
2530 
2531  M0_ENTRY("[%p] map=%p ivec=%p", req, map, cursor->vc_ivv);
2532 
2533  /*
2534  * For a write, if this map does not span the whole parity group,
2535  * it is a read-modify-write.
2536  */
2537  if (map->pi_ioreq->ir_type == IRT_WRITE && grpstart < inode->i_size &&
2538  (m0_ivec_varr_cursor_index(cursor) > grpstart ||
2539  m0_ivec_varr_cursor_conti(cursor, grpend) < grpend))
2540  rmw = true;
2541 
2542  M0_LOG(M0_INFO, "[%p] grp_id=%llu: %s", req, map->pi_grpid,
2543  rmw ? "rmw" : "aligned");
2544 
2545  /* In 'verify mode', read all data units in this parity group */
2546  if (map->pi_ioreq->ir_type == IRT_READ && csb->csb_verify) {
2547  indexvec_varr_dump(&map->pi_ivv);
2548  M0_LOG(M0_DEBUG, "[%p] ivec=[%llu, +%llu)", req,
2549  grpstart, grpsize);
2550  V_SEG_NR(&map->pi_ivv) = 1;
2551  V_INDEX(&map->pi_ivv, 0) = grpstart;
2552  /* limit to file size. */
2553  count = min64u(grpend, inode->i_size) - grpstart;
2554  V_COUNT(&map->pi_ivv, 0) = round_up(count, PAGE_SIZE);
2555  rc = map->pi_ops->pi_seg_process(map, 0, rmw);
2556  m0_ivec_varr_cursor_move_to(cursor, grpend);
2557  } else
2558  rc = pargrp_iomap_populate_pi_ivec(map, cursor, rmw);
2559 
2560  if (rc != 0)
2561  return M0_ERR_INFO(rc, "[%p] failed", req);
2562 
2563  if (rmw) {
2565  parity_units_page_nr(play));
2566  if (rc != 0)
2567  return M0_ERR_INFO(rc, "[%p] failed", req);
2568  }
2569 
2570  /* For READ in verify mode or WRITE */
2571  if (map->pi_ioreq->ir_type == IRT_WRITE ||
2572  (map->pi_ioreq->ir_type == IRT_READ && csb->csb_verify))
2573  rc = map->pi_ops->pi_paritybufs_alloc(map);
2574 
2576 
2577  return M0_RC(rc);
2578 }
2579 
2585 {
2586  int rc = 0;
2587  uint32_t row;
2588  uint32_t row_nr;
2589  uint32_t col;
2590  uint32_t col_nr;
2591  struct data_buf ***bufs;
2592  struct m0_pdclust_layout *play;
2593  M0_PRE(map != NULL);
2594  M0_PRE(M0_IN(type, (M0_PUT_DATA, M0_PUT_PARITY)));
2595  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
2596 
2597  play = pdlayout_get(map->pi_ioreq);
2598 
2599  if (type == M0_PUT_DATA) {
2600  M0_ASSERT(map->pi_databufs != NULL);
2601  row_nr = rows_nr(play);
2602  col_nr = layout_n(play);
2603  bufs = map->pi_databufs;
2604  } else {
2605  row_nr = rows_nr(play);
2606  col_nr = layout_k(play);
2607  bufs = map->pi_paritybufs;
2608  }
2609 
2610  /*
2611  * Allocates data_buf structures from either ::pi_databufs
2612  * or ::pi_paritybufs array.
2613  * The loop traverses the matrix, column (unit) by column (unit).
2614  */
2615  for (col = 0; col < col_nr; ++col) {
2616  for (row = 0; row < row_nr; ++row) {
2617  /*
2618  * If the page is marked as PA_READ_FAILED, all
2619  * other pages belonging to the unit same as
2620  * the failed one, are also marked as PA_READ_FAILED,
2621  * hence the loop breaks from here.
2622  */
2623  if (bufs[row][col] != NULL &&
2624  bufs[row][col]->db_flags & PA_READ_FAILED)
2625  break;
2626  }
2627 
2628  if (row == row_nr)
2629  continue;
2630 
2631  for (row = 0; row < row_nr; ++row) {
2632  if (bufs[row][col] == NULL) {
2633  bufs[row][col] = data_buf_alloc_init(0);
2634  if (bufs[row][col] == NULL) {
2635  rc = M0_ERR(-ENOMEM);
2636  break;
2637  }
2638  }
2639  bufs[row][col]->db_flags |= PA_READ_FAILED;
2640  }
2641  }
2642  return M0_RC(rc);
2643 }
2644 
2645 static int unit_state(const struct m0_pdclust_src_addr *src,
2646  const struct io_request *req,
2647  enum m0_pool_nd_state *state)
2648 {
2649  struct m0_pdclust_instance *play_instance;
2650  struct m0_pdclust_tgt_addr tgt;
2651  int rc;
2652  struct m0_poolmach *pm;
2653 
2654  M0_ENTRY("[%p]", req);
2655 
2656  play_instance = pdlayout_instance(layout_instance(req));
2657  m0_fd_fwd_map(play_instance, src, &tgt);
2658 
2660  M0_ASSERT(pm != NULL);
2661  rc = m0_poolmach_device_state(pm, tgt.ta_obj, state);
2662  if (rc != 0)
2663  return M0_RC(rc);
2664  return M0_RC(rc);
2665 }
2666 
2667 static int io_spare_map(const struct pargrp_iomap *map,
2668  const struct m0_pdclust_src_addr *src,
2669  uint32_t *spare_slot, uint32_t *spare_slot_prev,
2670  enum m0_pool_nd_state *eff_state)
2671 {
2672 
2673  struct m0_pdclust_layout *play;
2674  struct m0_pdclust_instance *play_instance;
2675  const struct m0_fid *gfid;
2676  struct m0_pdclust_src_addr spare;
2677  int rc;
2678  struct m0_poolmach *pm;
2679 
2680  M0_ENTRY("[%p]", map->pi_ioreq);
2681  play = pdlayout_get(map->pi_ioreq);
2682  play_instance = pdlayout_instance(layout_instance(map->pi_ioreq));
2683  gfid = file_to_fid(map->pi_ioreq->ir_file);
2684 
2685  pm = m0t1fs_file_to_poolmach(map->pi_ioreq->ir_file);
2686  M0_ASSERT(pm != NULL);
2687  rc = m0_sns_repair_spare_map(pm, gfid, play, play_instance,
2688  src->sa_group, src->sa_unit,
2689  spare_slot, spare_slot_prev);
2690  if (rc != 0) {
2691  return M0_RC(rc);
2692  }
2693  /* Check if there is an effective failure of unit. */
2694  spare.sa_group = src->sa_group;
2695  spare.sa_unit = *spare_slot_prev;
2696  rc = unit_state(&spare, map->pi_ioreq, eff_state);
2697  return M0_RC(rc);
2698 }
2699 
2700 
2701 static void mark_page_as_read_failed(struct pargrp_iomap *map, uint32_t row,
2702  uint32_t col, enum page_attr page_type)
2703 {
2704  struct m0_pdclust_layout *play;
2705  struct m0_pdclust_src_addr src;
2706  enum m0_pool_nd_state state;
2707  uint32_t spare_slot;
2708  uint32_t spare_prev;
2709  int rc;
2710 
2711  M0_ENTRY("[%p] pid=%llu, row = %u, col=%u, type=0x%x",
2712  map->pi_ioreq, map->pi_grpid, row, col, page_type);
2713  M0_PRE(M0_IN(page_type,(PA_DATA, PA_PARITY)));
2714  M0_PRE(ergo(page_type == PA_DATA, map->pi_databufs[row][col] != NULL));
2715  M0_PRE(ergo(page_type == PA_PARITY,
2716  map->pi_paritybufs[row][col] != NULL));
2717 
2718  play = pdlayout_get(map->pi_ioreq);
2719  src.sa_group = map->pi_grpid;
2720  if (page_type == PA_DATA)
2721  src.sa_unit = col;
2722  else
2723  src.sa_unit = col + layout_n(play);
2724 
2725  rc = unit_state(&src, map->pi_ioreq, &state);
2726  M0_ASSERT(rc == 0);
2727  if (state == M0_PNDS_SNS_REPAIRED) {
2728  /* gets the state of corresponding spare unit */
2729  rc = io_spare_map(map, &src, &spare_slot, &spare_prev,
2730  &state);
2731  M0_ASSERT(rc == 0);
2732  }
2733  /*
2734  * Checking state M0_PNDS_SNS_REBALANCING allows concurrent read during
2735  * sns rebalancing in oostore mode. This works similarly to
2736  * M0_PNDS_FAILED.
2737  * To handle concurrent i/o in non-oostore mode, some more changes are
2738  * required to write data to live unit (on earlier failed device) if the
2739  * device state is M0_PNDS_SNS_REBALANCING.
2740  */
2741  if (M0_IN(state, (M0_PNDS_FAILED, M0_PNDS_OFFLINE,
2743  if (page_type == PA_DATA)
2744  map->pi_databufs[row][col]->db_flags |=
2746  else
2747  map->pi_paritybufs[row][col]->db_flags |=
2749  }
2750  M0_LEAVE();
2751 }
2752 
2760  struct target_ioreq *tio,
2761  m0_bindex_t *index,
2762  uint32_t count)
2763 {
2764  int rc = 0;
2765  uint32_t row;
2766  uint32_t col;
2767  m0_bindex_t goff;
2768  struct m0_pdclust_layout *play;
2769  struct m0_pdclust_src_addr src;
2770  enum m0_pool_nd_state dev_state;
2771  uint32_t spare_slot;
2772  uint32_t spare_slot_prev;
2773  struct m0_poolmach *pm;
2774  struct io_request *req;
2775 
2777  M0_ENTRY("[%p] grpid = %llu, count = %u\n",
2778  map->pi_ioreq, map->pi_grpid, count);
2779  M0_PRE(tio != NULL);
2780  M0_PRE(index != NULL);
2781  M0_PRE(count > 0);
2782 
2783  req = map->pi_ioreq;
2784  pm = m0t1fs_file_to_poolmach(map->pi_ioreq->ir_file);
2785  M0_ASSERT(pm != NULL);
2786  rc = m0_poolmach_device_state(pm, tio->ti_obj, &dev_state);
2787  play = pdlayout_get(req);
2788  pargrp_src_addr(index[0], req, tio, &src);
2789  M0_ASSERT(src.sa_group == map->pi_grpid);
2790  M0_ASSERT(src.sa_unit < layout_n(play) + layout_k(play));
2791  M0_LOG(M0_DEBUG, "[%p] src=[%llu:%llu] device state=%d",
2792  map->pi_ioreq, src.sa_group, src.sa_unit, dev_state);
2793  if (dev_state == M0_PNDS_SNS_REPAIRED) {
2794  rc = io_spare_map(map, &src, &spare_slot, &spare_slot_prev,
2795  &dev_state);
2796  M0_ASSERT(rc == 0);
2797  M0_LOG(M0_DEBUG, "[%p] spare=[%u] spare_prev=[%u] state=%d",
2798  map->pi_ioreq, spare_slot,
2799  spare_slot_prev, dev_state);
2800  if (dev_state == M0_PNDS_SNS_REPAIRED) {
2801  M0_LOG(M0_DEBUG, "reading from spare");
2802  return M0_RC(0);
2803  }
2804  }
2805  map->pi_state = PI_DEGRADED;
2806  ++req->ir_dgmap_nr;
2807  /* Failed segment belongs to a data unit. */
2808  if (src.sa_unit < layout_n(play)) {
2809  goff = gfile_offset(index[0], map, play, &src);
2810  page_pos_get(map, goff, &row, &col);
2811  M0_ASSERT(map->pi_databufs[row][col] != NULL);
2812  map->pi_databufs[row][col]->db_flags |= PA_READ_FAILED;
2813  } else {
2814  /* Failed segment belongs to a parity unit. */
2815  row = page_nr(index[0]) % page_nr(layout_unit_size(play));
2816  col = src.sa_unit - layout_n(play);
2817  M0_ASSERT(map->pi_paritybufs[row][col] != NULL);
2818  map->pi_paritybufs[row][col]->db_flags |= PA_READ_FAILED;
2819  }
2820  /*
2821  * Since m0_parity_math_recover() API will recover one or more
2822  * _whole_ units, all pages from a failed unit can be marked as
2823  * PA_READ_FAILED. These pages need not be read again.
2824  */
2826  if (rc != 0)
2827  return M0_ERR_INFO(rc, "[%p] Failed to mark pages from parity "
2828  "group", req);
2829 
2830  /*
2831  * If parity buffers are not allocated, they should be allocated
2832  * since they are needed for recovering lost data.
2833  */
2834  if (map->pi_paritybufs == NULL) {
2835  M0_ALLOC_ARR(map->pi_paritybufs, rows_nr(play));
2836  if (map->pi_paritybufs == NULL)
2837  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate "
2838  "parity buffers", req);
2839 
2840  for (row = 0; row < rows_nr(play); ++row) {
2841  M0_ALLOC_ARR(map->pi_paritybufs[row],
2842  layout_k(play));
2843  if (map->pi_paritybufs[row] == NULL) {
2844  rc = M0_ERR(-ENOMEM);
2845  goto par_fail;
2846  }
2847  }
2848  }
2850  return M0_RC(rc);
2851 
2852 par_fail:
2853  M0_ASSERT(rc != 0);
2854  for (row = 0; row < rows_nr(play); ++row)
2855  m0_free0(&map->pi_paritybufs[row]);
2856  m0_free0(&map->pi_paritybufs);
2857 
2858  return M0_ERR_INFO(rc, "[%p] dgmode_process failed", req);
2859 }
2860 
2862 {
2863  int rc = 0;
2864  bool within_eof;
2865  uint32_t row;
2866  uint32_t col;
2868  struct inode *inode;
2869  struct data_buf *dbuf;
2870  struct m0_pdclust_layout *play;
2871  struct m0t1fs_sb *csb;
2872  struct io_request *req;
2873 
2875 
2876  /*
2877  * read_old: Reads unavailable data subject to condition that
2878  * data lies within file size. Parity is already read.
2879  * read_rest: Reads parity units. Data for parity group is already
2880  * read.
2881  * simple_read: Reads unavailable data subject to condition that
2882  * data lies within file size. Parity also has to be read.
2883  */
2884 
2885  req = map->pi_ioreq;
2886  M0_ENTRY("[%p] parity group id %llu, map state = %d",
2887  req, map->pi_grpid, map->pi_state);
2888 
2890  play = pdlayout_get(req);
2891 
2892  /*
2893  * Data matrix from parity group.
2894  * The loop traverses column by column to be in sync with
2895  * increasing file offset.
2896  * This is necessary in order to generate correct index vector.
2897  */
2898  for (col = 0; col < layout_n(play); ++col) {
2899  for (row = 0; row < rows_nr(play); ++row) {
2900 
2901  start = data_page_offset_get(map, row, col);
2902  within_eof = start + PAGE_SIZE < inode->i_size ||
2903  (inode->i_size > 0 &&
2904  page_id(start + PAGE_SIZE - 1) ==
2905  page_id(inode->i_size - 1));
2906  if (map->pi_databufs[row][col] != NULL) {
2907  if (map->pi_databufs[row][col]->db_flags &
2909  continue;
2910  } else {
2911  /*
2912  * If current parity group map is degraded,
2913  * then recovery is needed and a new
2914  * data buffer needs to be allocated subject to
2915  * limitation of file size.
2916  */
2917  if (map->pi_state == PI_DEGRADED &&
2918  within_eof) {
2919  map->pi_databufs[row][col] =
2921  if (map->pi_databufs[row][col] ==
2922  NULL) {
2923  rc = M0_ERR(-ENOMEM);
2924  break;
2925  }
2926  mark_page_as_read_failed(map, row, col,
2927  PA_DATA);
2928  }
2929  if (map->pi_state == PI_HEALTHY)
2930  continue;
2931  }
2932  dbuf = map->pi_databufs[row][col];
2933  /*
2934  * Marks only those data buffers which lie within EOF.
2935  * Since all IO fops receive error
2936  * once sns repair starts (M0_PNDS_SNS_REPAIRING state)
2937  * read is not done for any of these fops.
2938  * Hence all pages other than the one which encountered
2939  * failure (PA_READ_FAILED flag set) are read in
2940  * degraded mode.
2941  */
2942  if (within_eof) {
2943  if (dbuf->db_flags & PA_READ_FAILED ||
2944  is_page_read(dbuf)) {
2945  continue;
2946  }
2947  dbuf->db_flags |= PA_DGMODE_READ;
2948  }
2949  }
2950  }
2951 
2952  if (rc != 0)
2953  goto err;
2954 
2955  csb = M0T1FS_SB(inode->i_sb);
2956  /* If parity group is healthy, there is no need to read parity. */
2957  if (map->pi_state != PI_DEGRADED && !csb->csb_verify)
2958  return M0_RC(0);
2959 
2960  /*
2961  * Populates the index vector if original read IO request did not
2962  * span it. Since recovery is needed using parity algorithms,
2963  * whole parity group needs to be read subject to file size limitation.
2964  * Ergo, parity group index vector contains only one segment
2965  * worth the parity group in size.
2966  */
2967  V_INDEX(&map->pi_ivv, 0) = map->pi_grpid * data_size(play);
2968  V_COUNT(&map->pi_ivv, 0) = min64u(V_INDEX(&map->pi_ivv, 0) +
2969  data_size(play),
2970  inode->i_size) -
2971  V_INDEX(&map->pi_ivv, 0);
2972  /*
2973  * m0_0vec requires all members except the last one to have data count
2974  * multiple of 4K.
2975  */
2976  V_COUNT(&map->pi_ivv, 0) = round_up(
2977  V_COUNT(&map->pi_ivv, 0),
2978  PAGE_SIZE);
2979  V_SEG_NR(&map->pi_ivv) = 1;
2980  indexvec_varr_dump(&map->pi_ivv);
2981  /* parity matrix from parity group. */
2982  for (row = 0; row < rows_nr(play); ++row) {
2983  for (col = 0; col < layout_k(play); ++col) {
2984 
2985  if (map->pi_paritybufs[row][col] == NULL) {
2986  map->pi_paritybufs[row][col] =
2988  if (map->pi_paritybufs[row][col] == NULL) {
2989  rc = M0_ERR(-ENOMEM);
2990  break;
2991  }
2992  }
2993  dbuf = map->pi_paritybufs[row][col];
2995  /* Skips the page if it is marked as PA_READ_FAILED. */
2996  if (dbuf->db_flags & PA_READ_FAILED ||
2997  is_page_read(dbuf)) {
2998  continue;
2999  }
3000  dbuf->db_flags |= PA_DGMODE_READ;
3001  }
3002  }
3003  if (rc != 0)
3004  goto err;
3005  return M0_RC(rc);
3006 err:
3007  return M0_ERR_INFO(rc,"[%p] %s", req,
3008  rc == -ENOMEM ? "Failed to allocate "
3009  "data buffer": "Illegal device queried for status");
3010 }
3011 
3013  uint8_t *failed)
3014 {
3015  struct m0_pdclust_layout *play;
3016  uint32_t col;
3017  uint32_t K = 0;
3018 
3019  play = pdlayout_get(map->pi_ioreq);
3020  for (col = 0; col < layout_n(play); ++col) {
3021  if (map->pi_databufs[0][col] != NULL &&
3022  map->pi_databufs[0][col]->db_flags &
3023  PA_READ_FAILED) {
3024  failed[col] = 1;
3025  ++K;
3026  }
3027 
3028  }
3029  for (col = 0; col < layout_k(play); ++col) {
3030  M0_ASSERT(map->pi_paritybufs[0][col] != NULL);
3031  if (map->pi_paritybufs[0][col]->db_flags &
3032  PA_READ_FAILED) {
3033  failed[col + layout_n(play)] = 1;
3034  ++K;
3035  }
3036  }
3037  return K;
3038 }
3039 
3041 {
3042  int rc = 0;
3043  uint32_t row;
3044  uint32_t col;
3045  uint32_t K;
3046  unsigned long zpage;
3047  struct m0_buf *data;
3048  struct m0_buf *parity;
3049  struct m0_buf failed;
3050  struct m0_pdclust_layout *play;
3051 
3053  M0_PRE(map->pi_state == PI_DEGRADED);
3054 
3055  M0_ENTRY("[%p] map %p", map->pi_ioreq, map);
3056 
3057  play = pdlayout_get(map->pi_ioreq);
3058  M0_ALLOC_ARR(data, layout_n(play));
3059  if (data == NULL)
3060  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate memory"
3061  " for data buf", map->pi_ioreq);
3062 
3063  M0_ALLOC_ARR(parity, layout_k(play));
3064  if (parity == NULL) {
3065  m0_free(data);
3066  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate memory"
3067  " for parity buf", map->pi_ioreq);
3068  }
3069 
3070  zpage = get_zeroed_page(GFP_KERNEL);
3071  if (zpage == 0) {
3072  m0_free(data);
3073  m0_free(parity);
3074  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate page.",
3075  map->pi_ioreq);
3076  }
3077 
3078  failed.b_nob = layout_n(play) + layout_k(play);
3079  failed.b_addr = m0_alloc(failed.b_nob);
3080  if (failed.b_addr == NULL) {
3081  m0_free(data);
3082  m0_free(parity);
3083  free_page(zpage);
3084  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate memory "
3085  "for m0_buf", map->pi_ioreq);
3086  }
3087  K = iomap_dgmode_recov_prepare(map, (uint8_t *)failed.b_addr);
3088  if (K > layout_k(play)) {
3089  M0_LOG(M0_ERROR, "More failures in group %d",
3090  (int)map->pi_grpid);
3091  rc = -EIO;
3092  goto end;
3093  }
3094 
3095  /* Populates data and failed buffers. */
3096  for (row = 0; row < rows_nr(play); ++row) {
3097  for (col = 0; col < layout_n(play); ++col) {
3098  data[col].b_nob = PAGE_SIZE;
3099  if (map->pi_databufs[row][col] == NULL) {
3100  data[col].b_addr = (void *)zpage;
3101  continue;
3102  }
3103  data[col].b_addr = map->pi_databufs[row][col]->
3104  db_buf.b_addr;
3105  }
3106  for (col = 0; col < layout_k(play); ++col) {
3107  M0_ASSERT(map->pi_paritybufs[row][col] != NULL);
3108  parity[col].b_addr = map->pi_paritybufs[row][col]->
3109  db_buf.b_addr;
3110  parity[col].b_nob = PAGE_SIZE;
3111  }
3113  parity, &failed, M0_LA_INVERSE);
3114  if (rc != 0)
3115  goto end;
3116  }
3117 
3118 end:
3119  m0_free(data);
3120  m0_free(parity);
3121  m0_free(failed.b_addr);
3122  free_page(zpage);
3123  return rc == 0 ? M0_RC(0) : M0_ERR_INFO(rc, "Number of failed units"
3124  "in parity group exceeds the"
3125  "total number of parity units"
3126  "in a parity group %d.",
3127  (int)map->pi_grpid);
3128 }
3129 
3131 {
3132  uint64_t seg;
3133  uint64_t grp;
3134  uint64_t grpstart;
3135  uint64_t grpend;
3136  uint64_t *grparray;
3137  uint64_t grparray_sz;
3138  struct m0_pdclust_layout *play;
3139 
3140  M0_ENTRY("[%p]", req);
3141 
3142  play = pdlayout_get(req);
3143 
3144  /* Array of maximum possible number of groups spanned by req. */
3145  grparray_sz = indexvec_varr_count(&req->ir_ivv) / data_size(play) +
3146  2 * V_SEG_NR(&req->ir_ivv);
3147  M0_LOG(M0_DEBUG, "[%p] arr_sz=%llu", req, grparray_sz);
3148  M0_ALLOC_ARR(grparray, grparray_sz);
3149  if (grparray == NULL)
3150  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate memory"
3151  " for int array", req);
3152  /*
3153  * Finds out total number of parity groups spanned by
3154  * io_request::ir_ivec.
3155  */
3156  for (seg = 0; seg < V_SEG_NR(&req->ir_ivv); ++seg) {
3157  grpstart = group_id(V_INDEX(&req->ir_ivv, seg),
3158  data_size(play));
3159  grpend = group_id(v_seg_endpos(&req->ir_ivv, seg) - 1,
3160  data_size(play));
3161  for (grp = grpstart; grp <= grpend; ++grp) {
3162  uint64_t i;
3163  /*
3164  * grparray is a temporary array to record found groups.
3165  * Scan this array for [grpstart, grpend].
3166  * If not found, record it in this array and
3167  * increase ir_iomap_nr.
3168  */
3169  for (i = 0; i < req->ir_iomap_nr; ++i) {
3170  if (grparray[i] == grp)
3171  break;
3172  }
3173  /* 'grp' is not found. Adding it to @grparray */
3174  if (i == req->ir_iomap_nr) {
3175  M0_ASSERT_INFO(i < grparray_sz,
3176  "[%p] nr=%llu size=%llu",
3177  req, i , grparray_sz);
3178  grparray[i] = grp;
3179  ++req->ir_iomap_nr;
3180  }
3181  }
3182  }
3183  m0_free(grparray);
3184  return M0_RC(0);
3185 }
3186 
3188 {
3189  int rc;
3190  uint64_t map;
3191  struct m0_ivec_varr_cursor cursor;
3192  struct m0_pdclust_layout *play;
3193 
3194  M0_PRE(req != NULL);
3195 
3196  M0_ENTRY("[%p]", req);
3197  play = pdlayout_get(req);
3198 
3200  if (rc != 0)
3201  return M0_RC(rc);
3202 
3203  M0_LOG(M0_DEBUG, "[%p] spanned_groups=%llu [N,K,us]=[%d,%d,%llu]",
3204  req, req->ir_iomap_nr, layout_n(play),
3205  layout_k(play), layout_unit_size(play));
3206 
3207  /* req->ir_iomaps is zeroed out on allocation. */
3209  if (req->ir_iomaps == NULL) {
3210  rc = M0_ERR(-ENOMEM);
3211  goto failed;
3212  }
3213 
3214  m0_ivec_varr_cursor_init(&cursor, &req->ir_ivv);
3215 
3216  /*
3217  * cursor is advanced maximum by parity group size in one iteration
3218  * of this loop.
3219  * This is done by pargrp_iomap::pi_ops::pi_populate().
3220  */
3221  for (map = 0; !m0_ivec_varr_cursor_move(&cursor, 0); ++map) {
3222  M0_ASSERT(map < req->ir_iomap_nr);
3223  M0_ASSERT(req->ir_iomaps[map] == NULL);
3225  if (req->ir_iomaps[map] == NULL) {
3226  rc = M0_ERR(-ENOMEM);
3227  goto failed;
3228  }
3229 
3230  ++iommstats.a_pargrp_iomap_nr;
3233  data_size(play)));
3234  if (rc != 0) {
3235  m0_free0(&req->ir_iomaps[map]);
3236  goto failed;
3237  }
3238 
3239  /* @cursor is advanced in the following function */
3241  ir_iomaps[map], &cursor);
3242  if (rc != 0)
3243  goto failed;
3244  M0_LOG(M0_INFO, "[%p] pargrp_iomap id : %llu populated",
3245  req, req->ir_iomaps[map]->pi_grpid);
3246  }
3247  return M0_RC(0);
3248 failed:
3249  if (req->ir_iomaps != NULL)
3251 
3252  return M0_ERR_INFO(rc, "[%p] iomaps_prepare failed", req);
3253 }
3254 
3256 {
3257  uint64_t i;
3258 
3259  M0_ENTRY("[%p]", req);
3260 
3261  M0_PRE(req != NULL);
3262  M0_PRE(req->ir_iomaps != NULL);
3263 
3264  for (i = 0; i < req->ir_iomap_nr; ++i) {
3265  if (req->ir_iomaps[i] != NULL) {
3267  m0_free(req->ir_iomaps[i]);
3268  ++iommstats.d_pargrp_iomap_nr;
3269  }
3270  }
3271  m0_free0(&req->ir_iomaps);
3272  req->ir_iomap_nr = 0;
3273 }
3274 
3276 {
3277  int rc;
3278  uint64_t cnt;
3279  struct io_request *req;
3280  struct dgmode_rwvec *dg;
3281  struct m0_pdclust_layout *play;
3282 
3283  M0_ENTRY();
3284  M0_PRE(ti != NULL);
3285  M0_PRE(ti->ti_dgvec == NULL);
3286 
3287  req = bob_of(ti->ti_nwxfer, struct io_request, ir_nwxfer,
3288  &ioreq_bobtype);
3289  play = pdlayout_get(req);
3291  (layout_n(play) + layout_k(play)));
3292  M0_LOG(M0_DEBUG, "[%p]", req);
3293 
3294  M0_ALLOC_PTR(dg);
3295  if (dg == NULL) {
3296  rc = M0_ERR(-ENOMEM);
3297  goto failed;
3298  }
3299 
3300  dg->dr_tioreq = ti;
3301 
3303  if (rc != 0)
3304  goto failed_free_dg;
3305 
3307  if (rc != 0)
3308  goto failed_free_iv;
3309 
3310  rc = m0_varr_init(&dg->dr_pageattrs, cnt, sizeof(enum page_attr),
3311  (size_t)m0_pagesize_get());
3312  if (rc != 0)
3313  goto failed_free_bv;
3314 
3315  /*
3316  * This value is incremented every time a new segment is added
3317  * to this index vector.
3318  */
3319  V_SEG_NR(&dg->dr_ivec_varr) = 0;
3320 
3321  ti->ti_dgvec = dg;
3322  return M0_RC(0);
3323 
3324 failed_free_bv:
3326 failed_free_iv:
3328 failed_free_dg:
3329  m0_free(dg);
3330 failed:
3331  return M0_ERR_INFO(rc, "[%p] Dgmode read vector allocation failed",
3332  req);
3333 }
3334 
3336 {
3337  M0_ENTRY();
3338 
3339  M0_PRE(dg != NULL);
3340 
3341  dg->dr_tioreq = NULL;
3344  m0_varr_fini(&dg->dr_pageattrs);
3345 }
3346 
3351 static void databufs_set_dgw_mode(struct pargrp_iomap *iomap,
3352  struct m0_ext *ext)
3353 {
3354  uint32_t row_start;
3355  uint32_t row_end;
3356  uint32_t row;
3357  uint32_t col;
3358  struct data_buf *dbuf;
3359 
3360  page_pos_get(iomap, ext->e_start, &row_start, &col);
3361  page_pos_get(iomap, ext->e_end - 1, &row_end, &col);
3362 
3363  for (row = row_start; row <= row_end; ++row) {
3364  dbuf = iomap->pi_databufs[row][col];
3365  if (dbuf->db_flags & PA_WRITE)
3366  dbuf->db_flags |= PA_DGMODE_WRITE;
3367  }
3368 }
3369 
3373 static void paritybufs_set_dgw_mode(struct pargrp_iomap *iomap,
3374  struct m0_pdclust_layout *play,
3375  uint64_t unit)
3376 {
3377  uint32_t row;
3378  uint32_t col;
3379  uint64_t unit_size = layout_unit_size(play);
3380  struct data_buf *dbuf;
3381 
3382  parity_page_pos_get(iomap, unit * unit_size, &row, &col);
3383  for (; row < rows_nr(play); ++row) {
3384  dbuf = iomap->pi_paritybufs[row][col];
3385  if (dbuf->db_flags & PA_WRITE)
3386  dbuf->db_flags |= PA_DGMODE_WRITE;
3387  }
3388 }
3389 
3390 /*
3391  * Distributes file data into target_ioreq objects as required and populates
3392  * target_ioreq::ti_ivv and target_ioreq::ti_bufvec.
3393  */
3394 static int nw_xfer_io_distribute(struct nw_xfer_request *xfer)
3395 {
3396  int rc;
3397  uint64_t i;
3398  uint64_t unit;
3399  uint64_t unit_size;
3400  uint64_t count;
3401  uint64_t pgstart;
3402  uint64_t pgend;
3403  /* Extent representing a data unit. */
3404  struct m0_ext u_ext;
3405  /* Extent representing resultant extent. */
3406  struct m0_ext r_ext;
3407  /* Extent representing a segment from index vector. */
3408  struct m0_ext v_ext;
3409  struct io_request *req;
3410  struct target_ioreq *ti;
3411  struct m0_ivec_varr_cursor cur;
3412  struct m0_pdclust_layout *play;
3413  enum m0_pdclust_unit_type unit_type;
3414  struct m0_pdclust_src_addr src;
3415  struct m0_pdclust_tgt_addr tgt;
3416  struct pargrp_iomap *iomap;
3417  struct inode *inode;
3418  struct m0t1fs_sb *csb;
3419 
3420  M0_ENTRY("nw_xfer_request %p", xfer);
3422 
3423  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
3424  play = pdlayout_get(req);
3425  unit_size = layout_unit_size(play);
3426 
3427  for (i = 0; i < req->ir_iomap_nr; ++i) {
3428  count = 0;
3429  iomap = req->ir_iomaps[i];
3430  pgstart = data_size(play) * iomap->pi_grpid;
3431  pgend = pgstart + data_size(play);
3432  src.sa_group = iomap->pi_grpid;
3433 
3434  M0_LOG(M0_DEBUG, "[%p] iomap=%p [grpid=%llu state=%u]",
3435  req, iomap, iomap->pi_grpid, iomap->pi_state);
3436 
3437  /* traverse parity group ivec by units */
3438  m0_ivec_varr_cursor_init(&cur, &iomap->pi_ivv);
3439  while (!m0_ivec_varr_cursor_move(&cur, count)) {
3440 
3441  unit = (m0_ivec_varr_cursor_index(&cur) - pgstart) /
3442  unit_size;
3443 
3444  u_ext.e_start = pgstart + unit * unit_size;
3445  u_ext.e_end = u_ext.e_start + unit_size;
3446 
3448  v_ext.e_end = v_ext.e_start +
3450 
3451  m0_ext_intersection(&u_ext, &v_ext, &r_ext);
3452  M0_ASSERT(m0_ext_is_valid(&r_ext));
3453  count = m0_ext_length(&r_ext);
3454 
3455  unit_type = m0_pdclust_unit_classify(play, unit);
3456  M0_ASSERT(unit_type == M0_PUT_DATA);
3457 
3459  databufs_set_dgw_mode(iomap, &r_ext);
3460 
3461  src.sa_unit = unit;
3462  rc = xfer->nxr_ops->nxo_tioreq_map(xfer, &src, &tgt,
3463  &ti);
3464  if (rc != 0) {
3465  M0_LOG(M0_DEBUG, "[%p] iomap=%p "
3466  "nxo_tioreq_map() failed, rc=%d",
3467  req, iomap, rc);
3468  goto err;
3469  }
3470 
3471  M0_LOG(M0_DEBUG, "[%p] adding data. ti state=%d\n",
3472  req, ti->ti_state);
3473  ti->ti_ops->tio_seg_add(ti, &src, &tgt, r_ext.e_start,
3474  m0_ext_length(&r_ext), iomap);
3475  }
3476 
3477  inode = iomap_to_inode(iomap);
3478  csb = M0T1FS_SB(inode->i_sb);
3479 
3480  /* process parity units */
3481  if (req->ir_type == IRT_WRITE ||
3482  (req->ir_type == IRT_READ && csb->csb_verify) ||
3484  iomap->pi_state == PI_DEGRADED)) {
3485 
3486  for (unit = 0; unit < layout_k(play); ++unit) {
3487 
3488  src.sa_unit = layout_n(play) + unit;
3489 
3491  src.sa_unit) == M0_PUT_PARITY);
3492 
3493  rc = xfer->nxr_ops->nxo_tioreq_map(xfer, &src,
3494  &tgt, &ti);
3495  if (rc != 0) {
3496  M0_LOG(M0_DEBUG, "[%p] iomap=%p "
3497  "nxo_tioreq_map() failed, rc=%d",
3498  req, iomap, rc);
3499  goto err;
3500  }
3501 
3503  paritybufs_set_dgw_mode(iomap, play,
3504  unit);
3505 
3506  ti->ti_ops->tio_seg_add(ti, &src, &tgt, pgstart,
3507  layout_unit_size(play),
3508  iomap);
3509  }
3510 
3511  if (!csb->csb_oostore || req->ir_type != IRT_WRITE)
3512  continue;
3513 
3514  /* Cob create for spares. */
3515  for (unit = layout_k(play); unit < 2 * layout_k(play);
3516  ++unit) {
3517  src.sa_unit = layout_n(play) + unit;
3518  rc = xfer->nxr_ops->nxo_tioreq_map(xfer, &src,
3519  &tgt, &ti);
3520  if (rc != 0) {
3521  M0_LOG(M0_ERROR, "[%p] iomap=%p "
3522  "nxo_tioreq_map() failed, rc=%d",
3523  req, iomap, rc);
3524  }
3525  if (target_ioreq_type_get(ti) != TI_NONE)
3526  continue;
3528  }
3529  }
3530  }
3531 
3532  return M0_RC(0);
3533 err:
3534  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
3535  tioreqht_htable_del(&xfer->nxr_tioreqs_hash, ti);
3536  M0_LOG(M0_INFO, "[%p] target_ioreq deleted for "FID_F,
3537  req, FID_P(&ti->ti_fid));
3538  target_ioreq_fini(ti);
3539  m0_free0(&ti);
3540  ++iommstats.d_target_ioreq_nr;
3541  } m0_htable_endfor;
3542 
3543  return M0_ERR_INFO(rc, "[%p] io_prepare failed", req);
3544 }
3545 
3546 static inline int ioreq_sm_timedwait(struct io_request *req,
3547  uint64_t state)
3548 {
3549  int rc;
3550  M0_PRE(req != NULL);
3551 
3552  M0_ENTRY("[%p] Waiting for %s -> %s, Pending fops %llu, "
3553  "Pending rdbulk %llu", req,
3554  io_states[ioreq_sm_state(req)].sd_name,
3555  io_states[state].sd_name,
3558 
3561  M0_TIME_NEVER);
3563 
3564  if (rc != 0)
3565  M0_LOG(M0_DEBUG, "[%p] rc %d", req, rc);
3566  M0_LEAVE("[%p] rc %d", req, rc);
3567  return rc;
3568 }
3569 
3571 {
3572  int rc = 0;
3573  uint64_t i;
3574  struct pargrp_iomap *iomap;
3575 
3576  M0_ENTRY("[%p]", req);
3579 
3580  for (i = 0; i < req->ir_iomap_nr; ++i) {
3581  iomap = req->ir_iomaps[i];
3582  if (iomap->pi_state == PI_DEGRADED) {
3583  rc = iomap->pi_ops->pi_dgmode_recover(iomap);
3584  if (rc != 0)
3585  return M0_ERR_INFO(rc, "[%p] Failed to recover"
3586  " data", req);
3587  }
3588  }
3589 
3590  return M0_RC(rc);
3591 }
3592 
3597 static uint64_t tolerance_of_level(struct io_request *req, uint64_t lv)
3598 {
3599  struct m0_pdclust_instance *play_instance;
3600  struct m0_pool_version *pver;
3601 
3603 
3604  play_instance = pdlayout_instance(layout_instance(req));
3605  pver = play_instance->pi_base.li_l->l_pver;
3606  return pver->pv_fd_tol_vec[lv];
3607 }
3608 
3615 static bool is_session_marked(struct io_request *req,
3616  struct m0_rpc_session *session)
3617 {
3618  uint64_t i;
3619  uint64_t max_failures;
3620  uint64_t session_id;
3621 
3622  session_id = session->s_session_id;
3624  for (i = 0; i < max_failures; ++i) {
3625  if (req->ir_failed_session[i] == session_id)
3626  return true;
3627  else if (req->ir_failed_session[i] == ~(uint64_t)0) {
3628  req->ir_failed_session[i] = session_id;
3629  return false;
3630  }
3631  }
3632  return false;
3633 }
3634 
3641 static int device_check(struct io_request *req)
3642 {
3643  int rc = 0;
3644  uint32_t fdev_nr = 0;
3645  uint32_t fsvc_nr = 0;
3646  struct target_ioreq *ti;
3647  struct m0_pdclust_layout *play;
3648  enum m0_pool_nd_state state;
3649  uint64_t max_failures;
3651 
3653 
3654  M0_ENTRY("[%p]", req);
3655  M0_PRE(req != NULL);
3657  IRS_WRITE_COMPLETE)));
3658  play = pdlayout_get(req);
3659  m0_htable_for (tioreqht, ti, &req->ir_nwxfer.nxr_tioreqs_hash) {
3660  rc = m0_poolmach_device_state(pm, ti->ti_obj, &state);
3661  if (rc != 0)
3662  return M0_ERR_INFO(rc, "[%p] Failed to retrieve target "
3663  "device state", req);
3664  /* The case when a particular service is down. */
3665  if (ti->ti_rc == -ECANCELED) {
3666  if (!is_session_marked(req, ti->ti_session)) {
3667  M0_CNT_INC(fsvc_nr);
3668  }
3669  /* The case when multiple devices under the same service are
3670  * unavailable. */
3671  } else if (M0_IN(state, (M0_PNDS_FAILED, M0_PNDS_OFFLINE,
3673  !is_session_marked(req, ti->ti_session)) {
3674  M0_CNT_INC(fdev_nr);
3675  }
3676  } m0_htable_endfor;
3677  M0_LOG(M0_DEBUG, "failed devices = %d\ttolerance=%d", (int)fdev_nr,
3678  (int)layout_k(play));
3679  if (is_pver_dud(fdev_nr, layout_k(play), fsvc_nr, max_failures))
3680  return M0_ERR_INFO(-EIO, "[%p] Failed to recover data "
3681  "since number of failed data units "
3682  "(%lu) exceeds number of parity "
3683  "units in parity group (%lu) OR "
3684  "number of failed services (%lu) "
3685  "exceeds number of max failures "
3686  "supported (%lu)",
3687  req, (unsigned long)fdev_nr,
3688  (unsigned long)layout_k(play),
3689  (unsigned long)fsvc_nr,
3690  (unsigned long)max_failures);
3691  return M0_RC(fdev_nr);
3692 }
3693 
3694 /* If there are F(l) failures at level l, and K(l) failures are tolerable for
3695  * the level l, then the condition for pool-version to be non-dud is:
3696  * \sum_over_l {F(l) / K(l)} <= 1
3697  * Once MOTR-899 lands into dev, this function will go away.
3698  */
3699 static bool is_pver_dud(uint32_t fdev_nr, uint32_t dev_k, uint32_t fsvc_nr,
3700  uint32_t svc_k)
3701 {
3702  if (fdev_nr > 0 && dev_k == 0)
3703  return true;
3704  if (fsvc_nr > 0 && svc_k == 0)
3705  return true;
3706  return (svc_k + fsvc_nr > 0) ?
3707  (fdev_nr * svc_k + fsvc_nr * dev_k) > dev_k * svc_k :
3708  fdev_nr > dev_k;
3709 }
3710 
3711 static int ioreq_dgmode_write(struct io_request *req, bool rmw)
3712 {
3713  int rc;
3714  struct target_ioreq *ti;
3715  struct m0t1fs_sb *csb;
3716  struct nw_xfer_request *xfer;
3717 
3719 
3720  xfer = &req->ir_nwxfer;
3721  M0_ENTRY("[%p]", req);
3722  csb = file_to_sb(req->ir_file);
3723  /* In oostore mode we do not enter the degraded mode write. */
3724  if (csb->csb_oostore || M0_IN(xfer->nxr_rc, (0, -E2BIG, -ESTALE)))
3725  return M0_RC(xfer->nxr_rc);
3726 
3727  rc = device_check(req);
3728  if (rc < 0 ) {
3729  return M0_RC(rc);
3730  }
3732  /*
3733  * This IO request has already acquired distributed lock on the
3734  * file by this time.
3735  * Degraded mode write needs to handle 2 prime use-cases.
3736  * 1. SNS repair still to start on associated global fid.
3737  * 2. SNS repair has completed for associated global fid.
3738  * Both use-cases imply unavailability of one or more devices.
3739  *
3740  * In first use-case, repair is yet to start on file. Hence,
3741  * rest of the file data which goes on healthy devices can be
3742  * written safely.
3743  * In this case, the fops meant for failed device(s) will be simply
3744  * dropped and rest of the fops will be sent to respective ioservice
3745  * instances for writing data to servers.
3746  * Later when this IO request relinquishes the distributed lock on
3747  * associated global fid and SNS repair starts on the file, the lost
3748  * data will be regenerated using parity recovery algorithms.
3749  *
3750  * The second use-case implies completion of SNS repair for associated
3751  * global fid and the lost data is regenerated on distributed spare
3752  * units.
3753  * Ergo, all the file data meant for lost device(s) will be redirected
3754  * towards corresponding spare unit(s). Later when SNS rebalance phase
3755  * commences, it will migrate the data from spare to a new device, thus
3756  * making spare available for recovery again.
3757  * In this case, old fops will be discarded and all pages spanned by
3758  * IO request will be reshuffled by redirecting pages meant for
3759  * failed device(s) to its corresponding spare unit(s).
3760  */
3761 
3762  /*
3763  * Finalizes current fops which are not valid anymore.
3764  * Fops need to be finalized in either case since old network buffers
3765  * from IO fops are still enqueued in transfer machine and removal
3766  * of these buffers would lead to finalization of rpc bulk object.
3767  */
3768  M0_LOG(M0_ERROR, "[%p] Degraded write:About to nxo_complete()", req);
3769  xfer->nxr_ops->nxo_complete(xfer, rmw);
3770  /*
3771  * Resets count of data bytes and parity bytes along with
3772  * return status.
3773  * Fops meant for failed devices are dropped in
3774  * nw_xfer_req_dispatch().
3775  */
3776  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
3777  ti->ti_databytes = 0;
3778  ti->ti_parbytes = 0;
3779  ti->ti_rc = 0;
3780  ti->ti_req_type = TI_NONE;
3781  } m0_htable_endfor;
3782 
3783  /*
3784  * Redistributes all pages by routing pages for repaired devices
3785  * to spare units for each parity group.
3786  */
3787  rc = xfer->nxr_ops->nxo_distribute(xfer);
3788  if (rc != 0)
3789  return M0_ERR_INFO(rc, "[%p] Failed to redistribute file data "
3790  "between target_ioreq objects", req);
3791 
3792  xfer->nxr_rc = 0;
3793  req->ir_rc = xfer->nxr_rc;
3794 
3795  rc = xfer->nxr_ops->nxo_dispatch(xfer);
3796  if (rc != 0)
3797  return M0_ERR_INFO(rc, "[%p] Failed to dispatch degraded mode"
3798  "write IO fops", req);
3799 
3801  if (rc != 0)
3802  return M0_ERR_INFO(rc, "[%p] Degraded mode write IO failed",
3803  req);
3804  return M0_RC(xfer->nxr_rc);
3805 }
3806 
3807 static int ioreq_dgmode_read(struct io_request *req, bool rmw)
3808 {
3809  int rc = 0;
3810  uint64_t i;
3811  struct io_req_fop *irfop;
3812  struct target_ioreq *ti;
3813  enum m0_pool_nd_state state;
3814  struct m0_poolmach *pm;
3815  struct nw_xfer_request *xfer;
3816  struct pargrp_iomap *iomap;
3817  struct m0t1fs_sb *csb;
3818 
3819 
3821 
3822  csb = M0T1FS_SB(m0t1fs_file_to_inode(req->ir_file)->i_sb);
3823  xfer = &req->ir_nwxfer;
3824  M0_ENTRY("[%p] xfer->nxr_rc=%d", req, xfer->nxr_rc);
3825 
3826  /*
3827  * If all devices are ONLINE, all requests return success.
3828  * In case of read before write, due to CROW, COB will not be present,
3829  * resulting into ENOENT error. When conf cache is drained io should
3830  * not proceed.
3831  */
3832  if (M0_IN(xfer->nxr_rc, (0, -ENOENT, -ESTALE)) ||
3833  /*
3834  * For rmw in oostore case return immediately without
3835  * bothering to check if degraded read can be done.
3836  * Write IO should be aborted in this case.
3837  */
3838  (csb->csb_oostore && req->ir_type == IRT_WRITE))
3839  return M0_RC(xfer->nxr_rc);
3840 
3841  rc = device_check(req);
3842  /*
3843  * Number of failed devices is not a criteria good enough
3844  * by itself. Even if one/more devices failed but IO request
3845  * could complete if IO request did not send any pages to
3846  * failed device(s) at all.
3847  */
3848  if (rc < 0)
3849  return M0_RC(rc);
3850  M0_LOG(M0_DEBUG, "[%p] Proceeding with the degraded read", req);
3852  M0_ASSERT(pm != NULL);
3853  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
3854  /*
3855  * Data was retrieved successfully, so no need to check the
3856  * state of the device.
3857  */
3858  if (ti->ti_rc == 0)
3859  continue;
3860  /* state is already queried in device_check() and stored
3861  * in ti->ti_state. Why do we do this again?
3862  */
3863  rc = m0_poolmach_device_state(pm, ti->ti_obj, &state);
3864  if (rc != 0)
3865  return M0_ERR_INFO(rc, "[%p] Failed to retrieve device "
3866  "state", req);
3867  M0_LOG(M0_INFO, "[%p] device state for "FID_F" is %d",
3868  req, FID_P(&ti->ti_fid), state);
3869  ti->ti_state = state;
3870  if (!M0_IN(state, (M0_PNDS_FAILED, M0_PNDS_OFFLINE,
3873  continue;
3874  /*
3875  * Finds out parity groups for which read IO failed and marks
3876  * them as DEGRADED. This is necessary since read IO request
3877  * could be reading only a part of a parity group but if it
3878  * failed, rest of the parity group also needs to be read
3879  * (subject to file size) in order to re-generate lost data.
3880  */
3881  m0_tl_for (iofops, &ti->ti_iofops, irfop) {
3882  rc = io_req_fop_dgmode_read(irfop);
3883  if (rc != 0)
3884  break;
3885  } m0_tl_endfor;
3886  } m0_htable_endfor;
3887 
3888  if (rc != 0)
3889  return M0_ERR_INFO(rc, "[%p] dgmode failed", req);
3890 
3891  M0_LOG(M0_DEBUG, "[%p] dgmap_nr=%u is in dgmode",
3892  req, req->ir_dgmap_nr);
3893  /*
3894  * Starts processing the pages again if any of the parity groups
3895  * spanned by input IO-request is in degraded mode.
3896  */
3897  if (req->ir_dgmap_nr > 0) {
3898  M0_LOG(M0_DEBUG, "[%p] processing the failed parity groups",
3899  req);
3902 
3903  for (i = 0; i < req->ir_iomap_nr; ++i) {
3904  iomap = req->ir_iomaps[i];
3905  rc = iomap->pi_ops->pi_dgmode_postprocess(iomap);
3906  if (rc != 0)
3907  break;
3908  }
3909  } else {
3912  /*
3913  * By this time, the page count in target_ioreq::ti_ivec and
3914  * target_ioreq::ti_bufvec is greater than 1, but it is
3915  * invalid since the distribution is about to change.
3916  * Ergo, page counts in index and buffer vectors are reset.
3917  */
3918 
3919  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
3920  V_SEG_NR(&ti->ti_ivv) = 0;
3921  } m0_htable_endfor;
3922  }
3923 
3924  M0_LOG(M0_DEBUG, "[%p] About to nxo_complete()", req);
3925  xfer->nxr_ops->nxo_complete(xfer, rmw);
3926 
3927  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
3928  ti->ti_databytes = 0;
3929  ti->ti_parbytes = 0;
3930  ti->ti_rc = 0;
3931  } m0_htable_endfor;
3932 
3933  /* Resets the status code before starting degraded mode read IO. */
3934  req->ir_rc = xfer->nxr_rc = 0;
3935 
3936  rc = xfer->nxr_ops->nxo_distribute(xfer);
3937  if (rc != 0)
3938  return M0_ERR_INFO(rc, "[%p] Failed to prepare dgmode IO "
3939  "fops.", req);
3940 
3941  rc = xfer->nxr_ops->nxo_dispatch(xfer);
3942  if (rc != 0)
3943  return M0_ERR_INFO(rc, "[%p] Failed to dispatch degraded mode "
3944  "IO.", req);
3945 
3947  if (rc != 0)
3948  return M0_ERR_INFO(rc, "[%p] Degraded mode read IO failed.",
3949  req);
3950 
3951  if (xfer->nxr_rc != 0)
3952  return M0_ERR_INFO(xfer->nxr_rc,
3953  "[%p] Degraded mode read IO failed.", req);
3954  /*
3955  * Recovers lost data using parity recovery algorithms only if
3956  * one or more devices were in FAILED, OFFLINE, REPAIRING state.
3957  */
3958  if (req->ir_dgmap_nr > 0) {
3960  if (rc != 0)
3961  return M0_ERR_INFO(rc, "[%p] Failed to recover lost "
3962  "data.", req);
3963  }
3964 
3965  return M0_RC(rc);
3966 }
3967 
3968 extern const struct m0_uint128 m0_rm_m0t1fs_group;
3969 
3970 static int ioreq_file_lock(struct io_request *req)
3971 {
3972  int rc;
3973  struct m0t1fs_inode *mi;
3974 
3975  M0_PRE(req != NULL);
3976  M0_ENTRY("[%p]", req);
3977 
3980  m0_file_lock(&mi->ci_fowner, &req->ir_in);
3981  m0_rm_owner_lock(&mi->ci_fowner);
3984  M0_TIME_NEVER);
3985  m0_rm_owner_unlock(&mi->ci_fowner);
3986  rc = rc ?: req->ir_in.rin_rc;
3987 
3988  return M0_RC(rc);
3989 }
3990 
3991 static void ioreq_file_unlock(struct io_request *req)
3992 {
3993  M0_PRE(req != NULL);
3994  M0_ENTRY("[%p]", req);
3996 }
3997 
3998 static int ioreq_no_lock(struct io_request *req)
3999 {
4000  return 0;
4001 }
4002 
4003 static void ioreq_no_unlock(struct io_request *req)
4004 {;}
4005 
4006 static void device_state_reset(struct nw_xfer_request *xfer, bool rmw)
4007 {
4008  struct target_ioreq *ti;
4009 
4010  M0_PRE(xfer != NULL);
4011  M0_PRE(xfer->nxr_state == NXS_COMPLETE);
4012 
4013  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
4014  ti->ti_state = M0_PNDS_ONLINE;
4015  } m0_htable_endfor;
4016 }
4017 
4018 static int ioreq_iosm_handle(struct io_request *req)
4019 {
4020  int rc;
4021  bool rmw;
4022  uint64_t i;
4023  struct inode *inode;
4024  struct target_ioreq *ti;
4025  struct nw_xfer_request *xfer;
4026  struct m0t1fs_sb *csb;
4027 
4029  xfer = &req->ir_nwxfer;
4030  M0_ENTRY("[%p] sb %p", req, file_to_sb(req->ir_file));
4031  csb = M0T1FS_SB(m0t1fs_file_to_inode(req->ir_file)->i_sb);
4032 
4033  for (i = 0; i < req->ir_iomap_nr; ++i) {
4034  if (M0_IN(req->ir_iomaps[i]->pi_rtype,
4036  break;
4037  }
4038 
4039  /*
4040  * Acquires lock before proceeding to do actual IO.
4041  */
4042  rc = req->ir_ops->iro_file_lock(req);
4043  if (rc != 0) {
4044  M0_LOG(M0_ERROR, "[%p] iro_file_lock() failed: rc=%d", req, rc);
4045  goto fail;
4046  }
4047 
4048  /* @todo Do error handling based on m0_sm::sm_rc. */
4049  /*
4050  * Since m0_sm is part of io_request, for any parity group
4051  * which is partial, read-modify-write state transition is followed
4052  * for all parity groups.
4053  */
4054  M0_LOG(M0_DEBUG, "[%p] map=%llu map_nr=%llu",
4055  req, i, req->ir_iomap_nr);
4056  if (i == req->ir_iomap_nr) {
4057  enum io_req_state state;
4058 
4059  rmw = false;
4060  state = req->ir_type == IRT_READ ? IRS_READING :
4061  IRS_WRITING;
4062  if (state == IRS_WRITING) {
4064  CD_COPY_FROM_USER, 0);
4065  if (rc != 0) {
4066  M0_LOG(M0_ERROR, "[%p] iro_user_data_copy() "
4067  "failed: rc=%d", req, rc);
4068  goto fail_locked;
4069  }
4071  if (rc != 0) {
4072  M0_LOG(M0_ERROR, "[%p] iro_parity_recalc() "
4073  "failed: rc=%d", req, rc);
4074  goto fail_locked;
4075  }
4076  }
4077  ioreq_sm_state_set(req, state);
4078  rc = xfer->nxr_ops->nxo_dispatch(xfer);
4079  if (rc != 0) {
4080  M0_LOG(M0_ERROR, "[%p] nxo_dispatch() failed: rc=%d",
4081  req, rc);
4082  goto fail_locked;
4083  }
4084  state = req->ir_type == IRT_READ ? IRS_READ_COMPLETE:
4086  rc = ioreq_sm_timedwait(req, state);
4087  if (rc != 0) {
4088  M0_LOG(M0_ERROR, "[%p] ioreq_sm_timedwait() failed: "
4089  "rc=%d", req, rc);
4090  goto fail_locked;
4091  }
4092  if (req->ir_rc != 0) {
4093  rc = req->ir_rc;
4094  M0_LOG(M0_ERROR, "[%p] ir_rc=%d", req, rc);
4095  goto fail_locked;
4096  }
4097  if (state == IRS_READ_COMPLETE) {
4098 
4099  /*
4100  * Returns immediately if all devices are
4101  * in healthy state.
4102  */
4103  rc = req->ir_ops->iro_dgmode_read(req, rmw);
4104  if (rc != 0) {
4105  M0_LOG(M0_ERROR, "[%p] iro_dgmode_read() "
4106  "failed: rc=%d", req, rc);
4107  goto fail_locked;
4108  }
4110  if (rc != 0) {
4111  M0_LOG(M0_ERROR, "[%p] parity verification "
4112  "failed: rc=%d", req, rc);
4113  goto fail_locked;
4114  }
4116  CD_COPY_TO_USER, 0);
4117  if (rc != 0) {
4118  M0_LOG(M0_ERROR, "[%p] iro_user_data_copy() "
4119  "failed: rc=%d", req, rc);
4120  goto fail_locked;
4121  }
4122  } else {
4123  M0_ASSERT(state == IRS_WRITE_COMPLETE);
4124  /*
4125  * Returns immediately if all devices are
4126  * in healthy state.
4127  */
4128  rc = req->ir_ops->iro_dgmode_write(req, rmw);
4129  if (rc != 0) {
4130  M0_LOG(M0_ERROR, "[%p] iro_dgmode_write() "
4131  "failed: rc=%d", req, rc);
4132  goto fail_locked;
4133  }
4134  }
4135  } else {
4136  uint32_t seg;
4137  m0_bcount_t read_pages = 0;
4138 
4139  rmw = true;
4140  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
4141  for (seg = 0; seg < V_SEG_NR(&ti->ti_bufvec); ++seg)
4142  if (PA(&ti->ti_pageattrs, seg) & PA_READ)
4143  ++read_pages;
4144  } m0_htable_endfor;
4145 
4146  /* Read IO is issued only if byte count > 0. */
4147  if (read_pages > 0) {
4149  rc = xfer->nxr_ops->nxo_dispatch(xfer);
4150  if (rc != 0) {
4151  M0_LOG(M0_ERROR, "[%p] nxo_dispatch() failed: "
4152  "rc=%d", req, rc);
4153  goto fail_locked;
4154  }
4155  }
4156 
4157  /* Waits for read completion if read IO was issued. */
4158  if (read_pages > 0) {
4160  if (rc != 0) {
4161  M0_LOG(M0_ERROR, "[%p] ioreq_sm_timedwait() "
4162  "failed: rc=%d", req, rc);
4163  goto fail_locked;
4164  }
4165 
4166  /*
4167  * Returns immediately if all devices are
4168  * in healthy state.
4169  */
4170  rc = req->ir_ops->iro_dgmode_read(req, rmw);
4171  if (rc != 0) {
4172  M0_LOG(M0_ERROR, "[%p] iro_dgmode_read() "
4173  "failed: rc=%d", req, rc);
4174  goto fail_locked;
4175  }
4176  }
4177 
4178  /*
4179  * If fops dispatch fails, we need to wait till all io fop
4180  * callbacks are acked since some IO fops might have been
4181  * dispatched.
4182  *
4183  * Only fully modified pages from parity groups which have
4184  * chosen read-rest approach or aligned parity groups,
4185  * are copied since read-old approach needs reading of
4186  * all spanned pages,
4187  * (no matter fully modified or partially modified)
4188  * in order to calculate parity correctly.
4189  */
4192  if (rc != 0) {
4193  M0_LOG(M0_ERROR, "[%p] iro_user_data_copy() failed: "
4194  "rc=%d", req, rc);
4195  goto fail_locked;
4196  }
4197 
4198  /* Copies
4199  * - fully modified pages from parity groups which have
4200  * chosen read_old approach and
4201  * - partially modified pages from all parity groups.
4202  */
4204  if (rc != 0) {
4205  M0_LOG(M0_ERROR, "[%p] iro_user_data_copy() failed: "
4206  "rc=%d", req, rc);
4207  goto fail_locked;
4208  }
4209 
4210  /* Finalizes the old read fops. */
4211  if (read_pages > 0) {
4212  M0_LOG(M0_DEBUG, "[%p] About to nxo_complete()", req);
4213  xfer->nxr_ops->nxo_complete(xfer, rmw);
4214  if (req->ir_rc != 0) {
4215  M0_LOG(M0_ERROR, "[%p] nxo_complete() failed: "
4216  "rc=%d", req, rc);
4217  rc = req->ir_rc;
4218  goto fail_locked;
4219  }
4220  device_state_reset(xfer, rmw);
4221  }
4224  if (rc != 0) {
4225  M0_LOG(M0_ERROR, "[%p] iro_parity_recalc() failed: "
4226  "rc=%d", req, rc);
4227  goto fail_locked;
4228  }
4229  rc = xfer->nxr_ops->nxo_dispatch(xfer);
4230  if (rc != 0) {
4231  M0_LOG(M0_ERROR, "[%p] nxo_dispatch() failed: rc=%d",
4232  req, rc);
4233  goto fail_locked;
4234  }
4235 
4237  if (rc != 0) {
4238  M0_LOG(M0_ERROR, "[%p] ioreq_sm_timedwait() failed: "
4239  "rc=%d", req,
4240  rc);
4241  goto fail_locked;
4242  }
4243 
4244  /* Returns immediately if all devices are in healthy state. */
4245  rc = req->ir_ops->iro_dgmode_write(req, rmw);
4246  if (rc != 0) {
4247  M0_LOG(M0_ERROR, "[%p] iro_dgmode_write() failed: "
4248  "rc=%d", req, rc);
4249  goto fail_locked;
4250  }
4251  }
4252 
4253  /*
4254  * Updates file size on successful write IO.
4255  * New file size is maximum value between old file size and
4256  * valid file position written in current write IO call.
4257  */
4260  uint64_t newsize = max64u(inode->i_size,
4262  V_SEG_NR(&req->ir_ivv) - 1));
4263 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0)
4264  rc = m0t1fs_size_update(req->ir_file->f_path.dentry, newsize);
4265 #else
4266  rc = m0t1fs_size_update(req->ir_file->f_dentry, newsize);
4267 #endif
4268  m0_mutex_lock(&csb->csb_confc_state.cus_lock);
4269  if (rc != 0 && csb->csb_confc_state.cus_state != M0_CC_READY) {
4270  m0_mutex_unlock(&csb->csb_confc_state.cus_lock);
4271  rc = M0_ERR(-ESTALE);
4272  goto fail_locked;
4273  }
4274  m0_mutex_unlock(&csb->csb_confc_state.cus_lock);
4275  M0_LOG(M0_INFO, "[%p] File size set to %llu", req,
4276  inode->i_size);
4277  }
4278 
4280 
4281  M0_LOG(M0_DEBUG, "[%p] About to nxo_complete()", req);
4282  xfer->nxr_ops->nxo_complete(xfer, rmw);
4283 
4284  if (rmw)
4286 
4287  return M0_RC(0);
4288 
4289 fail_locked:
4291 fail:
4293  M0_LOG(M0_DEBUG, "[%p] About to nxo_complete()", req);
4294  xfer->nxr_ops->nxo_complete(xfer, false);
4296  return M0_ERR_INFO(rc, "[%p] ioreq_iosm_handle failed", req);
4297 }
4298 
4299 static int io_request_init(struct io_request *req,
4300  struct file *file,
4301  const struct iovec *iov,
4302  struct m0_indexvec_varr *ivv,
4303  enum io_req_type rw)
4304 {
4305  struct m0t1fs_inode *ci;
4306  struct m0t1fs_sb *csb;
4307  struct m0_pool_version *pver;
4308  struct m0_layout_instance *li;
4309  int rc;
4310  uint32_t seg;
4311  uint32_t i;
4312  uint32_t max_failures;
4313 
4314  M0_ENTRY("[%p] rw %d", req, rw);
4315 
4316  M0_PRE(req != NULL);
4317  M0_PRE(file != NULL);
4318  M0_PRE(iov != NULL);
4319  M0_PRE(ivv != NULL);
4320  M0_PRE(M0_IN(rw, (IRT_READ, IRT_WRITE)));
4321  M0_PRE(M0_IS0(req));
4322 
4323  csb = file_to_sb(file);
4325  if (rc != 0)
4326  return M0_ERR(rc);
4327  req->ir_rc = 0;
4328  req->ir_file = file;
4329  req->ir_type = rw;
4330  req->ir_iovec = iov;
4331  req->ir_iomap_nr = 0;
4332  req->ir_copied_nr = 0;
4333  req->ir_direct_io = !!(file->f_flags & O_DIRECT);
4335  req->ir_ops = csb->csb_oostore ? &ioreq_oostore_ops : &ioreq_ops;
4336 
4337  /*
4338  * rconfc might have refreshed pool versions, and pool version for
4339  * this file might have got evicted forever. Check if we still have
4340  * the ground underneath.
4341  */
4343  pver = m0_pool_version_find(&csb->csb_pools_common, &ci->ci_pver);
4344  if (pver == NULL) {
4345  rc = M0_ERR_INFO(-ENOENT, "Cannot find pool version "FID_F,
4346  FID_P(&ci->ci_pver));
4347  goto err;
4348  }
4349  li = ci->ci_layout_instance;
4350  /*
4351  * File resides on a virtual pool version that got refreshed during
4352  * rconfc update leading to evicting the layout.
4353  */
4354  if (li == NULL) {
4356  if (rc != 0)
4357  goto err;
4358  }
4359  io_request_bob_init(req);
4361  if (req->ir_nwxfer.nxr_rc != 0) {
4363  "[%p] nw_xfer_req_init() failed", req);
4364  goto err;
4365  }
4367  M0_ALLOC_ARR(req->ir_failed_session, max_failures + 1);
4368  if (req->ir_failed_session == NULL) {
4369  rc = M0_ERR_INFO(-ENOMEM, "[%p] Allocation of an array of "
4370  "failed sessions.", req);
4371  goto err;
4372  }
4373  for (i = 0; i < max_failures; ++i) {
4374  req->ir_failed_session[i] = ~(uint64_t)0;
4375  }
4376 
4379 
4381 
4382  if (rc != 0) {
4384  M0_LOG(M0_ERROR, "[%p] Allocation of m0_indexvec_varr", req);
4385  goto err;
4386  }
4387 
4388  for (seg = 0; seg < V_SEG_NR(ivv); ++seg) {
4389  V_INDEX(&req->ir_ivv, seg) = V_INDEX(ivv, seg);
4390  V_COUNT(&req->ir_ivv, seg) = V_COUNT(ivv, seg);
4391  }
4392 
4393  /* Sorts the index vector in increasing order of file offset. */
4397 
4398  return M0_RC(0);
4399 err:
4401  return M0_ERR(rc);
4402 }
4403 
4404 static void io_request_fini(struct io_request *req)
4405 {
4406  struct target_ioreq *ti;
4407  struct m0_sm_group *grp;
4408  struct m0t1fs_sb *csb;
4409 
4411 
4412  M0_ENTRY("[%p]", req);
4413 
4414  csb = file_to_sb(req->ir_file);
4415  grp = req->ir_sm.sm_grp;
4416 
4418 
4419  m0_sm_fini(&req->ir_sm);
4420  io_request_bob_fini(req);
4421  req->ir_file = NULL;
4422  req->ir_iovec = NULL;
4423  req->ir_iomaps = NULL;
4424  req->ir_ops = NULL;
4426 
4427  m0_htable_for(tioreqht, ti, &req->ir_nwxfer.nxr_tioreqs_hash) {
4428  tioreqht_htable_del(&req->ir_nwxfer.nxr_tioreqs_hash, ti);
4429  M0_LOG(M0_DEBUG, "[%p] target_ioreq %p deleted for "FID_F,
4430  req, ti, FID_P(&ti->ti_fid));
4431  /*
4432  * All io_req_fop structures in list target_ioreq::ti_iofops
4433  * are already finalized in nw_xfer_req_complete().
4434  */
4435  target_ioreq_fini(ti);
4436  m0_free(ti);
4437  ++iommstats.d_target_ioreq_nr;
4438  } m0_htable_endfor;
4439 
4441 
4445  M0_LEAVE();
4446 }
4447 
4508  enum m0_pool_nd_state dev_state)
4509 {
4510  return (M0_IN(ioreq_sm_state(req),
4512  dev_state == M0_PNDS_SNS_REPAIRED)
4513  ||
4515  (dev_state == M0_PNDS_SNS_REPAIRED ||
4516  (dev_state == M0_PNDS_SNS_REPAIRING &&
4518 }
4519 
4520 static int nw_xfer_tioreq_map(struct nw_xfer_request *xfer,
4521  const struct m0_pdclust_src_addr *src,
4522  struct m0_pdclust_tgt_addr *tgt,
4523  struct target_ioreq **tio)
4524 {
4525  struct m0_fid tfid;
4526  const struct m0_fid *gfid;
4527  struct io_request *req;
4528  struct m0_rpc_session *session;
4529  struct m0_pdclust_layout *play;
4530  struct m0_pdclust_instance *play_instance;
4531  enum m0_pool_nd_state dev_state;
4532  int rc;
4533  struct m0_poolmach *pm;
4534 
4535  M0_ENTRY("nw_xfer_request %p", xfer);
4537  M0_PRE(src != NULL);
4538  M0_PRE(tgt != NULL);
4539 
4540  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
4541  play = pdlayout_get(req);
4542  play_instance = pdlayout_instance(layout_instance(req));
4543 
4544  m0_fd_fwd_map(play_instance, src, tgt);
4545  tfid = target_fid(req, tgt);
4546 
4547  M0_LOG(M0_DEBUG, "[%p] src_id[%llu:%llu] -> dest_id[%llu:%llu] "
4548  "@ tfid "FID_F, req, src->sa_group, src->sa_unit,
4549  tgt->ta_frame, tgt->ta_obj, FID_P(&tfid));
4550 
4552  M0_ASSERT(pm != NULL);
4553 
4554  rc = m0_poolmach_device_state(pm, tgt->ta_obj, &dev_state);
4555  if (rc != 0)
4556  return M0_RC(rc);
4557 
4559  tfid.f_container, tfid.f_key,
4561  dev_state, tgt->ta_frame, tgt->ta_obj,
4562  src->sa_group, src->sa_unit);
4563 
4564  if (M0_FI_ENABLED("poolmach_client_repaired_device1") &&
4565  tfid.f_container == 1)
4566  dev_state = M0_PNDS_SNS_REPAIRED;
4567 
4568  M0_LOG(M0_INFO, "[%p] tfid="FID_F" dev_state=%d\n",
4569  req, FID_P(&tfid), dev_state);
4570 
4571  if (should_spare_be_mapped(req, dev_state)) {
4572  struct m0_pdclust_src_addr spare = *src;
4573  uint32_t spare_slot;
4574  uint32_t spare_slot_prev;
4575  enum m0_pool_nd_state dev_state_prev;
4576 
4578  rc = m0_sns_repair_spare_map(pm, gfid, play, play_instance,
4579  src->sa_group, src->sa_unit,
4580  &spare_slot, &spare_slot_prev);
4581  if (M0_FI_ENABLED("poolmach_client_repaired_device1") &&
4582  tfid.f_container == 1) {
4583  rc = 0;
4584  spare_slot = layout_n(play) + layout_k(play);
4585  }
4586  if (rc != 0)
4587  return M0_RC(rc);
4588 
4589  /* Check if there is an effective-failure. */
4590  if (spare_slot_prev != src->sa_unit) {
4591  spare.sa_unit = spare_slot_prev;
4592  m0_fd_fwd_map(play_instance, &spare, tgt);
4593  tfid = target_fid(req, tgt);
4595  &dev_state_prev);
4596  if (rc != 0)
4597  return M0_RC(rc);
4598  } else
4599  dev_state_prev = M0_PNDS_SNS_REPAIRED;
4600 
4601  if (dev_state_prev == M0_PNDS_SNS_REPAIRED) {
4602  spare.sa_unit = spare_slot;
4603  m0_fd_fwd_map(play_instance, &spare, tgt);
4604  tfid = target_fid(req, tgt);
4605  }
4606  dev_state = dev_state_prev;
4607  M0_LOG(M0_DEBUG, "[%p] REPAIRED: [%llu:%llu] -> [%llu:%llu] "
4608  "@ tfid " FID_F, req, spare.sa_group, spare.sa_unit,
4609  tgt->ta_frame, tgt->ta_obj, FID_P(&tfid));
4611  tfid.f_container, tfid.f_key,
4612  m0_pdclust_unit_classify(play, spare.sa_unit),
4613  dev_state,
4614  tgt->ta_frame, tgt->ta_obj,
4615  spare.sa_group, spare.sa_unit);
4616  }
4617 
4618  session = target_session(req, tfid);
4619 
4620  rc = nw_xfer_tioreq_get(xfer, &tfid, tgt->ta_obj, session,
4621  layout_unit_size(play) * req->ir_iomap_nr, tio);
4622 
4625  dev_state != M0_PNDS_SNS_REPAIRED)
4626  (*tio)->ti_state = dev_state;
4627 
4628  return M0_RC(rc);
4629 }
4630 
4631 static int target_ioreq_init(struct target_ioreq *ti,
4632  struct nw_xfer_request *xfer,
4633  const struct m0_fid *cobfid,
4634  uint64_t ta_obj,
4635  struct m0_rpc_session *session,
4636  uint64_t size)
4637 {
4638  int rc;
4639  struct io_request *req;
4640  uint64_t cnt;
4641 
4642  M0_PRE(ti != NULL);
4643  M0_PRE(xfer != NULL);
4644  M0_PRE(cobfid != NULL);
4645  M0_PRE(session != NULL);
4646  M0_PRE(size > 0);
4647 
4648  M0_ENTRY("target_ioreq %p, nw_xfer_request %p, "FID_F,
4649  ti, xfer, FID_P(cobfid));
4650 
4651  ti->ti_rc = 0;
4652  ti->ti_ops = &tioreq_ops;
4653  ti->ti_fid = *cobfid;
4654  ti->ti_nwxfer = xfer;
4655  ti->ti_dgvec = NULL;
4656  ti->ti_req_type = TI_NONE;
4657  M0_SET0(&ti->ti_cc_fop);
4658  ti->ti_cc_fop_inited = false;
4659  /*
4660  * Target object is usually in ONLINE state unless explicitly
4661  * told otherwise.
4662  */
4663  ti->ti_state = M0_PNDS_ONLINE;
4664  ti->ti_session = session;
4665  ti->ti_parbytes = 0;
4666  ti->ti_databytes = 0;
4667 
4668  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
4669  ti->ti_obj = ta_obj;
4670 
4671  M0_LOG(M0_DEBUG, "[%p] ti %p", req, ti);
4672  iofops_tlist_init(&ti->ti_iofops);
4673  tioreqht_tlink_init(ti);
4674  target_ioreq_bob_init(ti);
4675  cnt = page_nr(size);
4676 
4678  if (rc != 0)
4679  goto fail;
4680 
4682  if (rc != 0)
4683  goto fail_free_iv;
4684 
4685  rc = m0_varr_init(&ti->ti_pageattrs, cnt, sizeof(enum page_attr),
4686  (size_t)m0_pagesize_get());
4687  if (rc != 0)
4688  goto fail_free_bv;
4689 
4690  /*
4691  * This value is incremented when new segments are added to the
4692  * index vector in target_ioreq_seg_add().
4693  */
4694  V_SEG_NR(&ti->ti_ivv) = 0;
4695 
4697  return M0_RC(0);
4698 
4699 fail_free_bv:
4701 fail_free_iv:
4703 fail:
4704  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate memory in "
4705  "target_ioreq_init", req);
4706 }
4707 
4708 static void target_ioreq_fini(struct target_ioreq *ti)
4709 {
4710  M0_ENTRY("target_ioreq %p, ti->ti_nwxfer %p", ti, ti->ti_nwxfer);
4712 
4713  target_ioreq_bob_fini(ti);
4714  tioreqht_tlink_fini(ti);
4715  iofops_tlist_fini(&ti->ti_iofops);
4716  ti->ti_ops = NULL;
4717  ti->ti_session = NULL;
4718  ti->ti_nwxfer = NULL;
4719 
4722  m0_varr_fini(&ti->ti_pageattrs);
4723  if (ti->ti_dgvec != NULL)
4725 
4726  if (ti->ti_cc_fop_inited) {
4727  struct m0_rpc_item *item = &ti->ti_cc_fop.crf_fop.f_item;
4728  M0_LOG(M0_DEBUG, "item=%p %s osr_xid=%"PRIu64,
4731  ti->ti_cc_fop_inited = false;
4733  }
4734 
4735  M0_LEAVE();
4736 }
4737 
4739  const struct m0_fid *fid)
4740 {
4741  struct target_ioreq *ti;
4742 
4743  M0_ENTRY("nw_xfer_request %p, fid %p", xfer, fid);
4745  M0_PRE(fid != NULL);
4746 
4747  ti = tioreqht_htable_lookup(&xfer->nxr_tioreqs_hash, &fid->f_container);
4748  M0_ASSERT(ergo(ti != NULL, m0_fid_cmp(fid, &ti->ti_fid) == 0));
4749 
4750  M0_LEAVE();
4751  return ti;
4752 }
4753 
4754 static int nw_xfer_tioreq_get(struct nw_xfer_request *xfer,
4755  const struct m0_fid *fid,
4756  uint64_t ta_obj,
4757  struct m0_rpc_session *session,
4758  uint64_t size,
4759  struct target_ioreq **out)
4760 {
4761  int rc = 0;
4762  struct target_ioreq *ti;
4763  struct io_request *req;
4764 
4766  M0_PRE(fid != NULL);
4767  M0_PRE(session != NULL);
4768  M0_PRE(out != NULL);
4769 
4770  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
4771  M0_ENTRY("[%p] "FID_F" ta_obj=%llu size=%llu",
4772  req, FID_P(fid), ta_obj, size);
4773 
4774  ti = target_ioreq_locate(xfer, fid);
4775  if (ti == NULL) {
4776  M0_ALLOC_PTR(ti);
4777  if (ti == NULL)
4778  return M0_ERR_INFO(-ENOMEM, "[%p] Failed to allocate "
4779  "memory for target_ioreq", req);
4780 
4781  rc = target_ioreq_init(ti, xfer, fid, ta_obj, session, size);
4782  if (rc == 0) {
4783  tioreqht_htable_add(&xfer->nxr_tioreqs_hash, ti);
4784  M0_LOG(M0_INFO, "[%p] New target_ioreq %p added for "
4785  FID_F, req, ti, FID_P(fid));
4786  } else {
4787  m0_free(ti);
4788  return M0_ERR_INFO(rc, "[%p] target_ioreq_init() "
4789  "failed", req);
4790  }
4791  ++iommstats.a_target_ioreq_nr;
4792  }
4793  if (ti->ti_dgvec == NULL && M0_IN(ioreq_sm_state(req),
4796 
4797  *out = ti;
4798  return M0_RC(rc);
4799 }
4800 
4801 static struct data_buf *data_buf_alloc_init(enum page_attr pattr)
4802 {
4803  struct data_buf *buf;
4804  unsigned long addr;
4805 
4806  M0_ENTRY();
4807  addr = get_zeroed_page(GFP_KERNEL);
4808  if (addr == 0) {
4809  M0_LOG(M0_ERROR, "Failed to get free page");
4810  return NULL;
4811  }
4812 
4813  ++iommstats.a_page_nr;
4814  M0_ALLOC_PTR(buf);
4815  if (buf == NULL) {
4816  free_page(addr);
4817  M0_LOG(M0_ERROR, "Failed to allocate data_buf");
4818  return NULL;
4819  }
4820 
4821  ++iommstats.a_data_buf_nr;
4822  data_buf_init(buf, (void *)addr, pattr);
4824  M0_LEAVE();
4825  return buf;
4826 }
4827 
4828 static void buf_page_free(struct m0_buf *buf)
4829 {
4830  M0_PRE(buf != NULL);
4831 
4832  free_page((unsigned long)buf->b_addr);
4833  ++iommstats.d_page_nr;
4834  buf->b_addr = NULL;
4835  buf->b_nob = 0;
4836 }
4837 
4838 static void data_buf_dealloc_fini(struct data_buf *buf)
4839 {
4840  M0_ENTRY("data_buf %p", buf);
4842 
4843  if (buf->db_page != NULL)
4844  user_page_unmap(buf, (buf->db_flags & PA_WRITE) ? false : true);
4845  else if (buf->db_buf.b_addr != NULL)
4846  buf_page_free(&buf->db_buf);
4847 
4848  if (buf->db_auxbuf.b_addr != NULL)
4849  buf_page_free(&buf->db_auxbuf);
4850 
4851  data_buf_fini(buf);
4852  m0_free(buf);
4853  ++iommstats.d_data_buf_nr;
4854  M0_LEAVE();
4855 }
4856 
4857 static void target_ioreq_seg_add(struct target_ioreq *ti,
4858  const struct m0_pdclust_src_addr *src,
4859  const struct m0_pdclust_tgt_addr *tgt,
4860  m0_bindex_t gob_offset,
4862  struct pargrp_iomap *map)
4863 {
4864  uint32_t seg;
4865  m0_bindex_t toff;
4866  m0_bindex_t goff;
4867  m0_bindex_t pgstart;
4868  m0_bindex_t pgend;
4869  struct data_buf *buf;
4870  struct io_request *req;
4871  struct m0_pdclust_layout *play;
4872  uint64_t frame = tgt->ta_frame;
4873  uint64_t unit = src->sa_unit;
4874  struct m0_indexvec_varr *ivv;
4875  struct m0_indexvec_varr *bvec;
4876  enum m0_pdclust_unit_type unit_type;
4877  struct m0_varr *pattr;
4878  uint64_t cnt;
4879 
4880  M0_ENTRY("tio req %p, gob_offset %llu, count %llu frame %llu unit %llu",
4881  ti, gob_offset, count, frame, unit);
4883  M0_PRE(map != NULL);
4884 
4885  req = bob_of(ti->ti_nwxfer, struct io_request, ir_nwxfer,
4886  &ioreq_bobtype);
4887  play = pdlayout_get(req);
4888 
4889  unit_type = m0_pdclust_unit_classify(play, unit);
4890  M0_ASSERT(M0_IN(unit_type, (M0_PUT_DATA, M0_PUT_PARITY)));
4891 
4892  toff = target_offset(frame, play, gob_offset);
4893  pgstart = toff;
4894  goff = unit_type == M0_PUT_DATA ? gob_offset : 0;
4895 
4896  M0_LOG(M0_DEBUG, "[%p] %llu: "
4897  "[gpos %6llu, +%llu][%llu,%llu]->[%llu,%llu] %c",
4898  req, map->pi_grpid,
4899  gob_offset, count, src->sa_group, src->sa_unit,
4900  tgt->ta_frame, tgt->ta_obj,
4901  unit_type == M0_PUT_DATA ? 'D' : 'P');
4902 
4903  /* Use ti_dgvec as long as it is dgmode-read/write. */
4906  M0_ASSERT(ti->ti_dgvec != NULL);
4907  ivv = &ti->ti_dgvec->dr_ivec_varr;
4908  bvec = &ti->ti_dgvec->dr_bufvec;
4909  pattr = &ti->ti_dgvec->dr_pageattrs;
4911  (layout_n(play) + layout_k(play)));
4912  M0_LOG(M0_DEBUG, "[%p] map_nr=%llu req state=%u cnt=%llu",
4914  } else {
4915  ivv = &ti->ti_ivv;
4916  bvec = &ti->ti_bufvec;
4917  pattr = &ti->ti_pageattrs;
4919  layout_n(play));
4920  M0_LOG(M0_DEBUG, "[%p] map_nr=%llu req state=%u cnt=%llu",
4922  }
4923 
4924  while (pgstart < toff + count) {
4925  pgend = min64u(pgstart + PAGE_SIZE, toff + count);
4926  seg = V_SEG_NR(ivv);
4927 
4928  V_INDEX(ivv, seg) = pgstart;
4929  V_COUNT(ivv, seg) = pgend - pgstart;
4930 
4931  if (unit_type == M0_PUT_DATA) {
4932  uint32_t row;
4933  uint32_t col;
4934 
4935  page_pos_get(map, goff, &row, &col);
4936  buf = map->pi_databufs[row][col];
4937 
4938  PA(pattr,seg) |= PA_DATA;
4939  M0_LOG(M0_DEBUG, "[%p] ti %p, Data seg %u added",
4940  req, ti, seg);
4941  } else {
4942  buf = map->pi_paritybufs[page_id(goff)]
4943  [unit % layout_n(play)];
4944  PA(pattr,seg) |= PA_PARITY;
4945  M0_LOG(M0_DEBUG, "[%p] ti %p, Parity seg %u added",
4946  req, ti, seg);
4947  }
4948  buf->db_tioreq = ti;
4949  V_ADDR (bvec, seg) = buf->db_buf.b_addr;
4950  V_COUNT(bvec, seg) = V_COUNT(ivv, seg);
4951  PA(pattr, seg) |= buf->db_flags;
4952  M0_LOG(M0_DEBUG, "[%p] ti %p, Seg id %d pageaddr=%p "
4953  "[%llu, %llu] added to target_ioreq with "FID_F
4954  " with flags 0x%x", req, ti, seg, V_ADDR(bvec, seg),
4955  V_INDEX(ivv, seg),
4956  V_COUNT(ivv, seg),
4957  FID_P(&ti->ti_fid),
4958  PA(pattr, seg));
4959 
4960  goff += V_COUNT(ivv, seg);
4961  pgstart = pgend;
4962  ++ V_SEG_NR(ivv);
4963  M0_ASSERT_INFO(V_SEG_NR(ivv) <= cnt,
4964  "[%p] ti %p, v_nr=%u, page_nr=%llu",
4965  req, ti, V_SEG_NR(ivv), cnt);
4966  }
4968  M0_LEAVE();
4969 }
4970 
4971 static int io_req_fop_init(struct io_req_fop *fop,
4972  struct target_ioreq *ti,
4973  enum page_attr pattr)
4974 {
4975  int rc;
4976  struct io_request *req;
4977 
4978  M0_ENTRY("io_req_fop %p, target_ioreq %p", fop, ti);
4979  M0_PRE(fop != NULL);
4980  M0_PRE(ti != NULL);
4981  M0_PRE(M0_IN(pattr, (PA_DATA, PA_PARITY)));
4982 
4983  io_req_fop_bob_init(fop);
4984  iofops_tlink_init(fop);
4985  fop->irf_pattr = pattr;
4986  fop->irf_tioreq = ti;
4987  fop->irf_reply_rc = 0;
4988  fop->irf_ast.sa_cb = io_bottom_half;
4989 
4990  req = bob_of(ti->ti_nwxfer, struct io_request, ir_nwxfer,
4991  &ioreq_bobtype);
4992  M0_ASSERT(M0_IN(ioreq_sm_state(req),
4995 
4996  fop->irf_ast.sa_mach = &req->ir_sm;
4997 
4998  rc = m0_io_fop_init(&fop->irf_iofop, file_to_fid(req->ir_file),
4999  M0_IN(ioreq_sm_state(req),
5003  /*
5004  * Changes ri_ops of rpc item so as to execute m0t1fs's own
5005  * callback on receiving a reply.
5006  */
5007  fop->irf_iofop.if_fop.f_item.ri_ops = &io_item_ops;
5008 
5009  M0_LOG(M0_DEBUG, "[%p] fop %p, m0_ref %p, "FID_F", %p[%u], "
5010  "rbulk %p", req, &fop->irf_iofop.if_fop,
5011  &fop->irf_iofop.if_fop.f_ref,
5012  FID_P(&fop->irf_tioreq->ti_fid), &fop->irf_iofop.if_fop.f_item,
5013  fop->irf_iofop.if_fop.f_item.ri_type->rit_opcode,
5014  &fop->irf_iofop.if_rbulk);
5016  return M0_RC(rc);
5017 }
5018 
5019 static void io_req_fop_fini(struct io_req_fop *fop)
5020 {
5021  M0_ENTRY("io_req_fop %p", fop);
5023 
5024  /*
5025  * IO fop is finalized (m0_io_fop_fini()) through rpc sessions code
5026  * using m0_rpc_item::m0_rpc_item_ops::rio_free().
5027  * see m0_io_item_free().
5028  */
5029 
5030  iofops_tlink_fini(fop);
5031 
5032  /*
5033  * io_req_bob_fini() is not done here so that struct io_req_fop
5034  * can be retrieved from struct m0_rpc_item using bob_of() and
5035  * magic numbers can be checked.
5036  */
5037 
5038  fop->irf_tioreq = NULL;
5039  fop->irf_ast.sa_cb = NULL;
5040  fop->irf_ast.sa_mach = NULL;
5041  M0_LEAVE();
5042 }
5043 
5044 static void irfop_fini(struct io_req_fop *irfop)
5045 {
5046  M0_PRE(irfop != NULL);
5047 
5048  M0_ENTRY("io_req_fop %p, rbulk %p, fop %p, %p[%u]", irfop,
5049  &irfop->irf_iofop.if_rbulk, &irfop->irf_iofop.if_fop,
5050  &irfop->irf_iofop.if_fop.f_item,
5053  io_req_fop_fini(irfop);
5054  m0_free(irfop);
5055  M0_LEAVE();
5056 }
5057 
5058 static void ioreq_failed_fini(struct io_request *req, int rc)
5059 {
5064 }
5065 
5066 /*
5067  * This function can be used by the ioctl which supports fully vectored
5068  * scatter-gather IO. The caller is supposed to provide an index vector
5069  * aligned with user buffers in struct iovec array.
5070  * This function is also used by file->f_op->aio_{read/write} path.
5071  */
5072 M0_INTERNAL ssize_t m0t1fs_aio(struct kiocb *kcb,
5073  const struct iovec *iov,
5074  struct m0_indexvec_varr *ivv,
5075  enum io_req_type rw)
5076 {
5077  int rc;
5078  ssize_t count;
5079  struct io_request *req;
5080  struct m0t1fs_sb *csb;
5081 
5083  M0_ENTRY("indexvec %p, rw %d", ivv, rw);
5084  M0_PRE(kcb != NULL);
5085  M0_PRE(iov != NULL);
5086  M0_PRE(ivv != NULL);
5087  M0_PRE(M0_IN(rw, (IRT_READ, IRT_WRITE)));
5088 
5089  csb = file_to_sb(kcb->ki_filp);
5090 again:
5091  M0_ALLOC_PTR(req);
5092  if (req == NULL)
5093  return M0_ERR_INFO(-ENOMEM, "Failed to allocate memory"
5094  " for io_request");
5095  ++iommstats.a_ioreq_nr;
5096 
5097  rc = io_request_init(req, kcb->ki_filp, iov, ivv, rw);
5098  if (rc != 0) {
5099  count = 0;
5100  goto last;
5101  }
5103  if (rc != 0) {
5104  M0_LOG(M0_ERROR, "[%p] Failed to prepare IO fops, rc %d",
5105  req, rc);
5107  count = 0;
5108  goto last;
5109  }
5110 
5112  if (rc != 0) {
5113  M0_LOG(M0_ERROR, "[%p] Failed to distribute file data "
5114  "between target_ioreq objects, rc %d", req, rc);
5117  count = 0;
5118  goto last;
5119  }
5120 
5122  if (rc == 0)
5123  rc = req->ir_rc;
5125  M0_LOG(M0_INFO, "[%p] nxr_bytes = %llu, copied_nr = %llu, count %lu, "
5126  "rc %d", req, req->ir_nwxfer.nxr_bytes, req->ir_copied_nr,
5127  count, rc);
5128 
5130 
5132 last:
5133  M0_LOG(M0_DEBUG, "[%p] rc = %d, io request returned %lu bytes",
5134  req, rc, count);
5135  m0_free(req);
5136  ++iommstats.d_ioreq_nr;
5137 
5138  if (rc == -EAGAIN)
5139  goto again;
5140 
5141  M0_LEAVE();
5142  return rc != 0 ? rc : count;
5143 }
5144 
5145 static struct m0_indexvec_varr *indexvec_create(unsigned long seg_nr,
5146  const struct iovec *iov,
5147  loff_t pos)
5148 {
5149  int rc;
5150  uint32_t i;
5151  struct m0_indexvec_varr *ivv;
5152 
5153  /*
5154  * Apparently, we need to use a new API to process io request
5155  * which can accept m0_indexvec_varr so that it can be reused by
5156  * the ioctl which provides fully vectored scatter-gather IO
5157  * to cluster library users.
5158  * For that, we need to prepare a m0_indexvec_varr and supply it
5159  * to this function.
5160  */
5161  M0_ENTRY("seg_nr %lu position %llu", seg_nr, pos);
5162  M0_ALLOC_PTR(ivv);
5163  if (ivv == NULL) {
5164  M0_LEAVE();
5165  return NULL;
5166  }
5167 
5169  if (rc != 0) {
5170  m0_free(ivv);
5171  M0_LEAVE();
5172  return NULL;
5173  }
5174 
5175  for (i = 0; i < seg_nr; ++i) {
5176  V_INDEX(ivv, i) = pos;
5177  V_COUNT(ivv, i) = iov[i].iov_len;
5178  pos += iov[i].iov_len;
5179  }
5180  M0_POST(indexvec_varr_count(ivv) > 0);
5181 
5182  M0_LEAVE();
5183  return ivv;
5184 }
5185 
5186 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5187 static ssize_t file_dio_write(struct kiocb *kcb, struct iov_iter *from)
5188 {
5189  struct file *file = kcb->ki_filp;
5190  struct inode *inode = m0t1fs_file_to_inode(file);
5191  ssize_t written;
5192 
5194  M0_ENTRY();
5195 
5196  inode_lock(inode);
5197  written = __generic_file_write_iter(kcb, from);
5198  inode_unlock(inode);
5199 
5200  if (written > 0)
5201  written = generic_write_sync(kcb, written);
5202 
5203  M0_LEAVE();
5204  return written;
5205 }
5206 #else
5207 static ssize_t file_dio_write(struct kiocb *kcb,
5208  const struct iovec *iov,
5209  unsigned long seg_nr,
5210  loff_t pos)
5211 {
5212  struct file *file = kcb->ki_filp;
5213  struct inode *inode = m0t1fs_file_to_inode(file);
5214  ssize_t written;
5215 
5217  M0_ENTRY();
5218  BUG_ON(kcb->ki_pos != pos);
5219 
5220  mutex_lock(&inode->i_mutex);
5221  written = __generic_file_aio_write(kcb, iov, seg_nr, &kcb->ki_pos);
5222  mutex_unlock(&inode->i_mutex);
5223 
5224  if (written > 0) {
5225  written = generic_write_sync(file, pos, written) ?: written;
5226  }
5227 
5228  M0_LEAVE();
5229  return written;
5230 }
5231 #endif
5232 
5233 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5234 static ssize_t aio_write(struct kiocb *kcb, struct iov_iter *from)
5235 {
5236  size_t count = 0;
5237  ssize_t written;
5238  struct m0_indexvec_varr *ivv;
5239 
5241  M0_PRE(kcb != NULL);
5242  M0_PRE(from != NULL);
5243  M0_ENTRY("struct iovec %p position %llu seg_nr %lu", from->iov, kcb->ki_pos, from->nr_segs);
5244 
5245  if (!file_to_sb(kcb->ki_filp)->csb_active) {
5246  M0_LEAVE();
5247  return M0_ERR(-EINVAL);
5248  }
5249 
5250  count = generic_write_checks(kcb, from);
5251  if (count <= 0) {
5252  M0_LEAVE();
5253  return 0;
5254  }
5255 
5256  if (kcb->ki_filp->f_flags & O_DIRECT) {
5257  written = file_dio_write(kcb, from);
5258  M0_LEAVE();
5259  return written;
5260  }
5261 
5262  ivv = indexvec_create(from->nr_segs, from->iov, kcb->ki_pos);
5263  if (ivv == NULL)
5264  return M0_ERR(-ENOMEM);
5265 
5266  indexvec_varr_dump(ivv);
5267 
5268  M0_LOG(M0_INFO, "Write vec-count = %llu seg_nr %lu",
5269  indexvec_varr_count(ivv), from->nr_segs);
5270  written = m0t1fs_aio(kcb, from->iov, ivv, IRT_WRITE);
5271 
5272  /* Updates file position. */
5273  if (written > 0)
5274  kcb->ki_pos = kcb->ki_pos + written;
5275 
5276  m0_indexvec_varr_free(ivv);
5277  m0_free(ivv);
5278  M0_LOG(M0_DEBUG, "written %llu", (unsigned long long)written);
5279  M0_LEAVE();
5280  return written;
5281 }
5282 #else
5283 static ssize_t aio_write(struct kiocb *kcb, const struct iovec *iov,
5284  unsigned long seg_nr, loff_t pos)
5285 {
5286  int rc;
5287  size_t count = 0;
5288  size_t saved_count;
5289  ssize_t written;
5290  struct m0_indexvec_varr *ivv;
5291 
5293  M0_ENTRY("struct iovec %p position %llu seg_nr %lu", iov, pos, seg_nr);
5294  M0_PRE(kcb != NULL);
5295  M0_PRE(iov != NULL);
5296  M0_PRE(seg_nr > 0);
5297 
5298  if (!file_to_sb(kcb->ki_filp)->csb_active) {
5299  M0_LEAVE();
5300  return M0_ERR(-EINVAL);
5301  }
5302 
5303  rc = generic_segment_checks(iov, &seg_nr, &count, VERIFY_READ);
5304  if (rc != 0) {
5305  M0_LEAVE();
5306  return 0;
5307  }
5308 
5309  saved_count = count;
5310  rc = generic_write_checks(kcb->ki_filp, &pos, &count, 0);
5311  if (rc != 0 || count == 0) {
5312  M0_LEAVE();
5313  return 0;
5314  }
5315 
5316  if (count != saved_count)
5317  seg_nr = iov_shorten((struct iovec *)iov, seg_nr, count);
5318 
5319  if (kcb->ki_filp->f_flags & O_DIRECT) {
5320  written = file_dio_write(kcb, iov, seg_nr, pos);
5321  M0_LEAVE();
5322  return written;
5323  }
5324 
5325  ivv = indexvec_create(seg_nr, iov, pos);
5326  if (ivv == NULL)
5327  return M0_ERR(-ENOMEM);
5328 
5329  indexvec_varr_dump(ivv);
5330 
5331  M0_LOG(M0_INFO, "Write vec-count = %llu seg_nr %lu",
5332  indexvec_varr_count(ivv), seg_nr);
5333  written = m0t1fs_aio(kcb, iov, ivv, IRT_WRITE);
5334 
5335  /* Updates file position. */
5336  if (written > 0)
5337  kcb->ki_pos = pos + written;
5338 
5339  m0_indexvec_varr_free(ivv);
5340  m0_free(ivv);
5341  M0_LOG(M0_DEBUG, "written %llu", (unsigned long long)written);
5342  M0_LEAVE();
5343  return written;
5344 }
5345 #endif
5346 
5347 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5348 static ssize_t file_aio_write(struct kiocb *kcb, struct iov_iter *from)
5349 #else
5350 static ssize_t file_aio_write(struct kiocb *kcb,
5351  const struct iovec *iov,
5352  unsigned long seg_nr,
5353  loff_t pos)
5354 #endif
5355 {
5356  ssize_t res;
5357  struct m0t1fs_inode *ci = m0t1fs_file_to_m0inode(kcb->ki_filp);
5358 
5360 
5362 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5363  res = aio_write(kcb, from);
5364  M0_ADDB2_ADD(M0_AVI_FS_IO_DESCR, kcb->ki_pos, res);
5365 #else
5366  res = aio_write(kcb, iov, seg_nr, pos);
5368 #endif
5370  return res;
5371 }
5372 
5373 
5374 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5375 static ssize_t aio_read(struct kiocb *kcb, struct iov_iter *from)
5376 {
5377  int seg;
5378  size_t count = 0;
5379  loff_t size;
5380  ssize_t res;
5381  struct file *filp;
5382  struct m0_indexvec_varr *ivv;
5383 
5385  M0_PRE(kcb != NULL);
5386  M0_PRE(from != NULL);
5387  M0_ENTRY("struct iovec %p position %llu", from->iov, kcb->ki_pos);
5388 
5389  filp = kcb->ki_filp;
5390  size = i_size_read(m0t1fs_file_to_inode(filp));
5391 
5392  /* Returns if super block is inactive. */
5393  if (!file_to_sb(filp)->csb_active)
5394  return M0_ERR(-EINVAL);
5395  if (kcb->ki_pos >= size)
5396  return M0_RC(0);
5397 
5398  if (filp->f_flags & O_DIRECT) {
5399  res = generic_file_read_iter(kcb, from);
5400  M0_LEAVE();
5401  return res;
5402  }
5403 
5404  count = iov_iter_count(from);
5405  if (count == 0)
5406  /*
5407  * And thus spake POSIX: "Before any action described below is
5408  * taken, and if nbyte is zero, the read() function may detect
5409  * and return errors as described below. In the absence of
5410  * errors, or if error detection is not performed, the read()
5411  * function shall return zero and have no other results."
5412  */
5413  return M0_RC(0);
5414 
5415  /* Index vector has to be created before io_request is created. */
5416  ivv = indexvec_create(from->nr_segs, from->iov, kcb->ki_pos);
5417  if (ivv == NULL)
5418  return M0_ERR(-ENOMEM);
5419 
5420  /*
5421  * For read IO, if any segment from index vector goes beyond EOF,
5422  * they are dropped and the index vector is truncated to EOF boundary.
5423  */
5424  for (seg = 0; seg < V_SEG_NR(ivv); ++seg) {
5425  if (v_seg_endpos(ivv, seg) > size) {
5426  V_COUNT(ivv, seg) = size - V_INDEX(ivv, seg);
5427  V_SEG_NR(ivv) = seg + 1;
5428  break;
5429  }
5430  }
5431  indexvec_varr_dump(ivv);
5432  if (indexvec_varr_count(ivv) == 0) {
5433  m0_indexvec_varr_free(ivv);
5434  m0_free(ivv);
5435  return M0_RC(0);
5436  }
5437 
5438  M0_LOG(M0_INFO, "Read vec-count = %llu", indexvec_varr_count(ivv));
5439  res = m0t1fs_aio(kcb, from->iov, ivv, IRT_READ);
5440  M0_LOG(M0_DEBUG, "Read @%llu vec-count = %8llu return = %8llu(%d)",
5441  kcb->ki_pos, indexvec_varr_count(ivv),
5442  (unsigned long long)res, (int)res);
5443  /* Updates file position. */
5444  if (res > 0)
5445  kcb->ki_pos = kcb->ki_pos + res;
5446 
5447  m0_indexvec_varr_free(ivv);
5448  m0_free(ivv);
5449  M0_LEAVE();
5450  return res;
5451 }
5452 #else
5453 static ssize_t aio_read(struct kiocb *kcb, const struct iovec *iov,
5454  unsigned long seg_nr, loff_t pos)
5455 {
5456  int seg;
5457  size_t count = 0;
5458  loff_t size;
5459  ssize_t res;
5460  struct file *filp;
5461  struct m0_indexvec_varr *ivv;
5462 
5464  M0_ENTRY("struct iovec %p position %llu", iov, pos);
5465  M0_PRE(kcb != NULL);
5466  M0_PRE(iov != NULL);
5467  M0_PRE(seg_nr > 0);
5468 
5469  filp = kcb->ki_filp;
5470  size = i_size_read(m0t1fs_file_to_inode(filp));
5471 
5472  /* Returns if super block is inactive. */
5473  if (!file_to_sb(filp)->csb_active)
5474  return M0_ERR(-EINVAL);
5475  if (pos >= size)
5476  return M0_RC(0);
5477 
5478  if (filp->f_flags & O_DIRECT) {
5479  res = generic_file_aio_read(kcb, iov, seg_nr, pos);
5480  M0_LEAVE();
5481  return res;
5482  }
5483 
5484  /*
5485  * Checks for access privileges and adjusts all segments
5486  * for proper count and total number of segments.
5487  */
5488  res = generic_segment_checks(iov, &seg_nr, &count, VERIFY_WRITE);
5489  if (res != 0) {
5490  M0_LEAVE();
5491  return res;
5492  }
5493 
5494  if (count == 0)
5495  /*
5496  * And thus spake POSIX: "Before any action described below is
5497  * taken, and if nbyte is zero, the read() function may detect
5498  * and return errors as described below. In the absence of
5499  * errors, or if error detection is not performed, the read()
5500  * function shall return zero and have no other results."
5501  */
5502  return M0_RC(0);
5503 
5504  /* Index vector has to be created before io_request is created. */
5505  ivv = indexvec_create(seg_nr, iov, pos);
5506  if (ivv == NULL)
5507  return M0_ERR(-ENOMEM);
5508 
5509  /*
5510  * For read IO, if any segment from index vector goes beyond EOF,
5511  * they are dropped and the index vector is truncated to EOF boundary.
5512  */
5513  for (seg = 0; seg < V_SEG_NR(ivv); ++seg) {
5514  if (v_seg_endpos(ivv, seg) > size) {
5515  V_COUNT(ivv, seg) = size - V_INDEX(ivv, seg);
5516  V_SEG_NR(ivv) = seg + 1;
5517  break;
5518  }
5519  }
5520  indexvec_varr_dump(ivv);
5521  if (indexvec_varr_count(ivv) == 0) {
5522  m0_indexvec_varr_free(ivv);
5523  m0_free(ivv);
5524  return M0_RC(0);
5525  }
5526 
5527  M0_LOG(M0_INFO, "Read vec-count = %llu", indexvec_varr_count(ivv));
5528  res = m0t1fs_aio(kcb, iov, ivv, IRT_READ);
5529  M0_LOG(M0_DEBUG, "Read @%llu vec-count = %8llu return = %8llu(%d)",
5530  pos, indexvec_varr_count(ivv),
5531  (unsigned long long)res, (int)res);
5532  /* Updates file position. */
5533  if (res > 0)
5534  kcb->ki_pos = pos + res;
5535 
5536  m0_indexvec_varr_free(ivv);
5537  m0_free(ivv);
5538  M0_LEAVE();
5539  return res;
5540 }
5541 #endif
5542 
5543 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5544 static ssize_t file_aio_read(struct kiocb *kcb, struct iov_iter *from)
5545 #else
5546 static ssize_t file_aio_read(struct kiocb *kcb,
5547  const struct iovec *iov,
5548  unsigned long seg_nr,
5549  loff_t pos)
5550 #endif
5551 {
5552  ssize_t res;
5553  struct m0t1fs_inode *ci = m0t1fs_file_to_m0inode(kcb->ki_filp);
5554 
5556 
5558 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5559  res = aio_read(kcb, from);
5560  M0_ADDB2_ADD(M0_AVI_FS_IO_DESCR, kcb->ki_pos, res);
5561 #else
5562  res = aio_read(kcb, iov, seg_nr, pos);
5564 #endif
5565 
5567  return res;
5568 }
5569 
5570 int m0t1fs_flush(struct file *file, fl_owner_t id)
5571 {
5572  struct inode *inode = m0t1fs_file_to_inode(file);
5573  struct m0t1fs_inode *ci = M0T1FS_I(inode);
5574  struct m0t1fs_mdop mo;
5575  struct m0t1fs_sb *csb = m0inode_to_sb(ci);
5576  int rc;
5577 
5579  M0_ENTRY("inode links:%d inode writecount = %d close size %d",
5580  (unsigned int)inode->i_nlink,
5581  atomic_read(&inode->i_writecount),
5582  (unsigned int)inode->i_size);
5583 
5584  if (!csb->csb_oostore || inode->i_nlink == 0 ||
5585  atomic_read(&inode->i_writecount) == 0)
5586  return M0_RC(0);
5587 
5588  M0_SET0(&mo);
5589  mo.mo_attr.ca_tfid = *m0t1fs_inode_fid(ci);
5590  mo.mo_attr.ca_size = inode->i_size;
5591  mo.mo_attr.ca_nlink = inode->i_nlink;
5592  mo.mo_attr.ca_pver = m0t1fs_file_to_pver(file)->pv_id;
5593  mo.mo_attr.ca_lid = ci->ci_layout_id;
5594  mo.mo_attr.ca_valid |= (M0_COB_SIZE | M0_COB_NLINK |
5596 
5598  return rc != 0 ? M0_ERR_INFO(rc, FID_F, FID_P(&mo.mo_attr.ca_tfid)) :
5599  M0_RC(rc);
5600 }
5601 
5602 const struct file_operations m0t1fs_reg_file_operations = {
5603  .llseek = generic_file_llseek,
5604 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
5605  .read_iter = file_aio_read,
5606  .write_iter = file_aio_write,
5607 #else
5608  .aio_read = file_aio_read,
5609  .aio_write = file_aio_write,
5610  .read = do_sync_read,
5611  .write = do_sync_write,
5612 #endif
5613 #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
5614  .unlocked_ioctl = m0t1fs_ioctl,
5615 #else
5616  .ioctl = m0t1fs_ioctl,
5617 #endif
5618  .fsync = m0t1fs_fsync,
5619  .flush = m0t1fs_flush,
5620 };
5621 
5622 static void client_passive_recv(const struct m0_net_buffer_event *evt)
5623 {
5624  struct m0_rpc_bulk *rbulk;
5625  struct m0_rpc_bulk_buf *buf;
5626  struct m0_net_buffer *nb;
5627  struct m0_io_fop *iofop;
5628  struct io_req_fop *reqfop;
5629  struct io_request *ioreq;
5630  uint32_t req_sm_state;
5631 
5632  M0_ENTRY();
5633 
5634  M0_PRE(evt != NULL);
5635  M0_PRE(evt->nbe_buffer != NULL);
5636 
5637  nb = evt->nbe_buffer;
5638  buf = (struct m0_rpc_bulk_buf *)nb->nb_app_private;
5639  rbulk = buf->bb_rbulk;
5640  iofop = container_of(rbulk, struct m0_io_fop, if_rbulk);
5641  reqfop = bob_of(iofop, struct io_req_fop, irf_iofop, &iofop_bobtype);
5642  ioreq = bob_of(reqfop->irf_tioreq->ti_nwxfer, struct io_request,
5643  ir_nwxfer, &ioreq_bobtype);
5644  M0_ASSERT(rbulk == &reqfop->irf_iofop.if_rbulk);
5645  M0_LOG(M0_DEBUG, "[%p] PASSIVE recv, e %p, status %d, len %llu, "
5646  "nbuf %p", ioreq, evt, evt->nbe_status, evt->nbe_length, nb);
5647 
5648  M0_ASSERT(m0_is_read_fop(&iofop->if_fop));
5649  M0_LOG(M0_DEBUG, "[%p] Pending fops %llu, Pending rdbulk %llu, "
5650  "fop %p, item %p, "FID_F", rbulk %p",
5651  ioreq, m0_atomic64_get(&ioreq->ir_nwxfer.nxr_iofop_nr),
5653  &iofop->if_fop, &iofop->if_fop.f_item,
5654  FID_P(&reqfop->irf_tioreq->ti_fid), rbulk);
5655 
5656  /*
5657  * buf will be released in this callback. But rbulk is still valid
5658  * after that.
5659  */
5661  if (evt->nbe_status != 0)
5662  return;
5663  m0_mutex_lock(&ioreq->ir_nwxfer.nxr_lock);
5664  req_sm_state = ioreq_sm_state(ioreq);
5665  if (req_sm_state != IRS_READ_COMPLETE &&
5666  req_sm_state != IRS_WRITE_COMPLETE) {
5667  /*
5668  * It is possible that io_bottom_half() has already
5669  * reduced the nxr_rdbulk_nr to 0 by this time, due to FOP
5670  * receiving some error.
5671  */
5672  if (m0_atomic64_get(&ioreq->ir_nwxfer.nxr_rdbulk_nr) > 0)
5674  if (should_req_sm_complete(ioreq)) {
5675  ioreq_sm_state_set(ioreq,
5676  (M0_IN(req_sm_state,
5677  (IRS_READING,
5681  }
5682  }
5683 
5685  M0_LEAVE();
5686 }
5687 
5689  .nbc_cb = {
5694  }
5695 };
5696 
5697 static int iofop_async_submit(struct m0_io_fop *iofop,
5698  struct m0_rpc_session *session)
5699 {
5700  int rc;
5701  struct m0_fop_cob_rw *rwfop;
5702  struct io_req_fop *reqfop;
5703  struct io_request *req;
5704  struct m0_rpc_item *item;
5705 
5706  M0_ENTRY("m0_io_fop %p m0_rpc_session %p", iofop, session);
5707  M0_PRE(iofop != NULL);
5708  M0_PRE(session != NULL);
5709 
5710  rwfop = io_rw_get(&iofop->if_fop);
5711  M0_ASSERT(rwfop != NULL);
5712 
5713  reqfop = bob_of(iofop, struct io_req_fop, irf_iofop, &iofop_bobtype);
5714  req = bob_of(reqfop->irf_tioreq->ti_nwxfer, struct io_request,
5715  ir_nwxfer, &ioreq_bobtype);
5716 
5718  rwfop->crw_desc.id_descs,
5720  if (rc != 0)
5721  goto out;
5722 
5723  iofop->if_fop.f_item.ri_session = session;
5724  item = &iofop->if_fop.f_item;
5725  item->ri_nr_sent_max = M0T1FS_RPC_MAX_RETRIES;
5726  item->ri_resend_interval = M0T1FS_RPC_RESEND_INTERVAL;
5727  rc = m0_rpc_post(item);
5728  M0_LOG(M0_DEBUG, "[%p] IO fop %p, %p[%u], rbulk %p, submitted to rpc, "
5729  "rc %d, ri_error %d", req, &iofop->if_fop, item,
5730  item->ri_type->rit_opcode, &iofop->if_rbulk, rc, item->ri_error);
5731  /*
5732  * Ignoring error from m0_rpc_post() so that the subsequent fop
5733  * submission goes on. This is to ensure that the ioreq gets into dgmode
5734  * subsequently without exiting from the healthy mode IO itself.
5735  */
5736 
5737  return M0_RC(0);
5738  /*
5739  * In case error is encountered either by m0_rpc_bulk_store() or
5740  * m0_rpc_post(), queued net buffers, if any, will be deleted at
5741  * io_req_fop_release.
5742  */
5743 out:
5744  return M0_RC(rc);
5745 }
5746 
5747 static void io_req_fop_release(struct m0_ref *ref)
5748 {
5749  struct m0_fop *fop;
5750  struct m0_io_fop *iofop;
5751  struct io_req_fop *reqfop;
5752  struct m0_rpc_bulk *rbulk;
5753  struct nw_xfer_request *xfer;
5754  struct m0_fop_cob_rw *rwfop;
5755  struct m0_rpc_machine *rmach;
5756  struct m0_rpc_item *item;
5757  struct io_request *req;
5758 
5759  M0_ENTRY("ref %p", ref);
5760  M0_PRE(ref != NULL);
5761 
5762  fop = container_of(ref, struct m0_fop, f_ref);
5763  rmach = m0_fop_rpc_machine(fop);
5764  iofop = container_of(fop, struct m0_io_fop, if_fop);
5765  reqfop = bob_of(iofop, struct io_req_fop, irf_iofop, &iofop_bobtype);
5766  rbulk = &iofop->if_rbulk;
5767  xfer = reqfop->irf_tioreq->ti_nwxfer;
5768  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
5769  item = &fop->f_item;
5770 
5771  M0_LOG(M0_DEBUG, "[%p] fop %p, Pending fops %llu, Pending rdbulk %llu",
5772  req, fop,
5773  (unsigned long long)m0_atomic64_get(&xfer->nxr_iofop_nr),
5774  (unsigned long long)m0_atomic64_get(&xfer->nxr_rdbulk_nr));
5775  M0_LOG(M0_DEBUG, "[%p] fop %p, "FID_F", %p[%u], ri_error %d, "
5776  "rbulk %p", req, &iofop->if_fop,
5777  FID_P(&reqfop->irf_tioreq->ti_fid), item,
5778  item->ri_type->rit_opcode, item->ri_error, rbulk);
5779 
5780  /*
5781  * Release the net buffers if rpc bulk object is still dirty.
5782  * And wait on channel till all net buffers are deleted from
5783  * transfer machine.
5784  */
5785  m0_mutex_lock(&xfer->nxr_lock);
5786  m0_mutex_lock(&rbulk->rb_mutex);
5787  if (!m0_tlist_is_empty(&rpcbulk_tl, &rbulk->rb_buflist)) {
5788  struct m0_clink clink;
5789  size_t buf_nr;
5790  size_t non_queued_buf_nr;
5791 
5793  m0_clink_add(&rbulk->rb_chan, &clink);
5794  buf_nr = rpcbulk_tlist_length(&rbulk->rb_buflist);
5795  non_queued_buf_nr = m0_rpc_bulk_store_del_unqueued(rbulk);
5796  m0_mutex_unlock(&rbulk->rb_mutex);
5797 
5798  m0_rpc_bulk_store_del(rbulk);
5799  M0_LOG(M0_DEBUG, "[%p] fop %p, %p[%u], bulk %p, buf_nr %llu, "
5800  "non_queued_buf_nr %llu", req, &iofop->if_fop, item,
5801  item->ri_type->rit_opcode, rbulk,
5802  (unsigned long long)buf_nr,
5803  (unsigned long long)non_queued_buf_nr);
5804  if (m0_is_read_fop(&iofop->if_fop))
5806  non_queued_buf_nr);
5808  /* rio_replied() is not invoked for this item. */
5809  m0_atomic64_dec(&xfer->nxr_iofop_nr);
5810  m0_mutex_unlock(&xfer->nxr_lock);
5811  /*
5812  * If there were some queued net bufs which had to be deleted,
5813  * then it is required to wait for their callbacks.
5814  */
5815  if (buf_nr > non_queued_buf_nr) {
5816  /*
5817  * rpc_machine_lock may be needed from nlx_tm_ev_worker
5818  * thread, which is going to wake us up. So we should
5819  * release it to avoid deadlock.
5820  */
5821  m0_rpc_machine_unlock(rmach);
5822  m0_chan_wait(&clink);
5823  m0_rpc_machine_lock(rmach);
5824  }
5826  m0_clink_fini(&clink);
5827  } else {
5828  m0_mutex_unlock(&rbulk->rb_mutex);
5829  m0_mutex_unlock(&xfer->nxr_lock);
5830  }
5832  M0_LOG(M0_DEBUG, "[%p] fop %p, Pending fops %llu, Pending rdbulk %llu",
5833  req, fop,
5834  (unsigned long long)m0_atomic64_get(&xfer->nxr_iofop_nr),
5835  (unsigned long long)m0_atomic64_get(&xfer->nxr_rdbulk_nr));
5836  M0_LOG(M0_DEBUG, "[%p] fop %p, "FID_F", %p[%u], ri_error %d, "
5837  "rbulk %p", req, &iofop->if_fop,
5838  FID_P(&reqfop->irf_tioreq->ti_fid), item,
5839  item->ri_type->rit_opcode, item->ri_error, rbulk);
5840 
5841  rwfop = io_rw_get(&iofop->if_fop);
5842  M0_ASSERT(rwfop != NULL);
5843  io_req_fop_fini(reqfop);
5844  /* see io_req_fop_fini(). */
5845  io_req_fop_bob_fini(reqfop);
5846  m0_io_fop_fini(iofop);
5847  m0_free(reqfop);
5848  ++iommstats.d_io_req_fop_nr;
5849 }
5850 
5851 static void cc_rpc_item_cb(struct m0_rpc_item *item)
5852 {
5853  struct io_request *req;
5854  struct cc_req_fop *cc_fop;
5855  struct target_ioreq *ti;
5856  struct m0_fop *fop;
5857  struct m0_fop *rep_fop;
5858 
5860  cc_fop = container_of(fop, struct cc_req_fop, crf_fop);
5861  ti = container_of(cc_fop, struct target_ioreq, ti_cc_fop);
5862  req = bob_of(ti->ti_nwxfer, struct io_request,
5863  ir_nwxfer, &ioreq_bobtype);
5864  cc_fop->crf_ast.sa_cb = cc_bottom_half;
5865  cc_fop->crf_ast.sa_datum = (void *)ti;
5866  /* Reference on fop and its reply are released in cc_bottom_half. */
5867  m0_fop_get(fop);
5868  if (item->ri_reply != NULL) {
5871  }
5872 
5873  m0_sm_ast_post(req->ir_sm.sm_grp, &cc_fop->crf_ast);
5874 }
5875 
5876 static void cc_bottom_half(struct m0_sm_group *grp, struct m0_sm_ast *ast)
5877 {
5878  struct nw_xfer_request *xfer;
5879  struct target_ioreq *ti;
5880  struct cc_req_fop *cc_fop;
5881  struct io_request *req;
5882  struct m0_fop_cob_op_reply *reply;
5883  struct m0_fop *reply_fop = NULL;
5884  struct m0t1fs_inode *inode;
5885  struct m0t1fs_sb *csb;
5886  struct m0_rpc_item *req_item;
5887  struct m0_rpc_item *reply_item;
5888  int rc;
5889 
5890  ti = (struct target_ioreq *)ast->sa_datum;
5891  req = bob_of(ti->ti_nwxfer, struct io_request, ir_nwxfer,
5892  &ioreq_bobtype);
5893  xfer = ti->ti_nwxfer;
5894  cc_fop = &ti->ti_cc_fop;
5895  req_item = &cc_fop->crf_fop.f_item;
5896  reply_item = req_item->ri_reply;
5897  rc = req_item->ri_error;
5898  if (reply_item != NULL) {
5899  reply_fop = m0_rpc_item_to_fop(reply_item);
5900  rc = rc ?: m0_rpc_item_generic_reply_rc(reply_item);
5901  }
5902  if (rc < 0 || reply_item == NULL) {
5903  M0_ASSERT(ergo(reply_item == NULL, rc != 0));
5904  goto ref_dec;
5905  }
5906 
5907  reply = m0_fop_data(m0_rpc_item_to_fop(reply_item));
5908  /*
5909  * Ignoring the case when an attempt is made to create a cob on target
5910  * where previous IO had created it.
5911  */
5912  rc = rc ? M0_IN(reply->cor_rc, (0, -EEXIST)) ? 0 : reply->cor_rc : 0;
5913 
5914  /*
5915  * In case the conf is updated is revoked
5916  * abort the ongoing request.
5917  */
5919  csb = M0T1FS_SB(inode->ci_inode.i_sb);
5920  m0_mutex_lock(&csb->csb_confc_state.cus_lock);
5921  if (csb->csb_confc_state.cus_state != M0_CC_READY)
5922  rc = M0_ERR(-ESTALE);
5923  m0_mutex_unlock(&csb->csb_confc_state.cus_lock);
5924 ref_dec:
5925  if (ti->ti_rc == 0 && rc != 0)
5926  ti->ti_rc = rc;
5927  if (xfer->nxr_rc == 0 && rc != 0)
5928  xfer->nxr_rc = rc;
5929  m0_fop_put0_lock(&cc_fop->crf_fop);
5930  if (reply_fop != NULL)
5932  m0_mutex_lock(&xfer->nxr_lock);
5933  m0_atomic64_dec(&xfer->nxr_ccfop_nr);
5936  m0_mutex_unlock(&xfer->nxr_lock);
5937 }
5938 
5940 {
5941  struct m0t1fs_sb *csb;
5942  struct m0t1fs_inode *inode;
5943 
5944 
5946  csb = M0T1FS_SB(inode->ci_inode.i_sb);
5947 
5948  return m0_atomic64_get(&req->ir_nwxfer.nxr_iofop_nr) == 0 &&
5950  ((csb->csb_oostore && ioreq_sm_state(req) == IRS_WRITING) ?
5951  m0_atomic64_get(&req->ir_nwxfer.nxr_ccfop_nr) == 0 : true);
5952 }
5953 
5954 static void io_rpc_item_cb(struct m0_rpc_item *item)
5955 {
5956  struct m0_fop *fop;
5957  struct m0_fop *rep_fop;
5958  struct m0_io_fop *iofop;
5959  struct io_req_fop *reqfop;
5960  struct io_request *ioreq;
5961 
5962  M0_PRE(item != NULL);
5963  M0_ENTRY("rpc_item %p[%u]", item, item->ri_type->rit_opcode);
5964 
5966  iofop = container_of(fop, struct m0_io_fop, if_fop);
5967  reqfop = bob_of(iofop, struct io_req_fop, irf_iofop, &iofop_bobtype);
5968  ioreq = bob_of(reqfop->irf_tioreq->ti_nwxfer, struct io_request,
5970  /*
5971  * NOTE: RPC errors are handled in io_bottom_half(), which is called
5972  * by reqfop->irf_ast.
5973  */
5974 
5975  /*
5976  * Acquire a reference on IO reply fop since its contents
5977  * are needed for policy decisions in io_bottom_half().
5978  * io_bottom_half() takes care of releasing the reference.
5979  */
5980  if (item->ri_reply != NULL) {
5983  }
5984 
5985  M0_LOG(M0_INFO, "[%p] io_req_fop %p, target fid "FID_F" item %p[%u], "
5986  "ri_error %d", ioreq, reqfop, FID_P(&reqfop->irf_tioreq->ti_fid),
5988  /*
5989  * Acquire a reference on IO fop so that it does not get
5990  * released until io_bottom_half() is executed for it.
5991  * io_bottom_half() takes care of releasing the reference.
5992  */
5993  m0_fop_get(&reqfop->irf_iofop.if_fop);
5994  m0_sm_ast_post(ioreq->ir_sm.sm_grp, &reqfop->irf_ast);
5995  M0_LEAVE();
5996 }
5997 
5998 M0_INTERNAL struct m0_file *m0_fop_to_file(struct m0_fop *fop)
5999 {
6000  struct m0_io_fop *iofop;
6001  struct io_req_fop *irfop;
6002  struct io_request *ioreq;
6003 
6004  iofop = container_of(fop, struct m0_io_fop, if_fop);
6005  irfop = bob_of(iofop, struct io_req_fop, irf_iofop, &iofop_bobtype);
6006  ioreq = bob_of(irfop->irf_tioreq->ti_nwxfer, struct io_request,
6008 
6009  return &m0t1fs_file_to_m0inode(ioreq->ir_file)->ci_flock;
6010 }
6011 
6012 M0_INTERNAL struct m0t1fs_sb *m0_fop_to_sb(struct m0_fop *fop)
6013 {
6014  struct m0_io_fop *iofop;
6015  struct io_req_fop *irfop;
6016  struct io_request *ioreq;
6017 
6018  iofop = container_of(fop, struct m0_io_fop, if_fop);
6019  irfop = bob_of(iofop, struct io_req_fop, irf_iofop, &iofop_bobtype);
6020  ioreq = bob_of(irfop->irf_tioreq->ti_nwxfer, struct io_request,
6022  return file_to_sb(ioreq->ir_file);
6023 }
6024 
6025 static void io_bottom_half(struct m0_sm_group *grp, struct m0_sm_ast *ast)
6026 {
6027  struct io_req_fop *irfop;
6028  struct io_request *req;
6029  struct target_ioreq *tioreq;
6030  struct nw_xfer_request *xfer;
6031  struct m0_io_fop *iofop;
6032  struct m0_fop *reply_fop = NULL;
6033  struct m0_rpc_item *req_item;
6034  struct m0_rpc_item *reply_item;
6035  struct m0_fop_cob_rw_reply *rw_reply;
6036  struct m0_reqh_service_ctx *ctx;
6037  struct m0t1fs_inode *inode;
6038  struct m0t1fs_sb *csb;
6039  struct m0_be_tx_remid *remid;
6040  uint64_t actual_bytes = 0;
6041  int rc;
6042 
6043  M0_ENTRY("sm_group %p sm_ast %p", grp, ast);
6044  M0_PRE(grp != NULL);
6045  M0_PRE(ast != NULL);
6046 
6047  irfop = bob_of(ast, struct io_req_fop, irf_ast, &iofop_bobtype);
6048  tioreq = irfop->irf_tioreq;
6049  req = bob_of(tioreq->ti_nwxfer, struct io_request, ir_nwxfer,
6050  &ioreq_bobtype);
6051  xfer = tioreq->ti_nwxfer;
6052 
6053  M0_ASSERT(xfer == &req->ir_nwxfer);
6054  M0_ASSERT(M0_IN(irfop->irf_pattr, (PA_DATA, PA_PARITY)));
6058  IRS_FAILED)));
6059  M0_ASSERT(req->ir_file != NULL);
6060 
6061  iofop = &irfop->irf_iofop;
6062  req_item = &iofop->if_fop.f_item;
6063  reply_item = req_item->ri_reply;
6064  M0_LOG(M0_DEBUG, "[%p] nxr_iofop_nr %llu, nxr_rdbulk_nr %llu, "
6065  "req item %p[%u], ri_error %d", req,
6066  (unsigned long long)m0_atomic64_get(&xfer->nxr_iofop_nr),
6067  (unsigned long long)m0_atomic64_get(&xfer->nxr_rdbulk_nr),
6068  req_item, req_item->ri_type->rit_opcode, req_item->ri_error);
6069 
6070  rc = req_item->ri_error;
6071  if (reply_item != NULL) {
6072  rc = rc ?: m0_rpc_item_generic_reply_rc(reply_item);
6073  }
6074  if (rc < 0 || reply_item == NULL) {
6075  M0_ASSERT(ergo(reply_item == NULL, rc != 0));
6076  M0_LOG(M0_ERROR, "[%p] item %p, rc=%d", req, req_item, rc);
6077  goto ref_dec;
6078  }
6079 
6080  reply_fop = m0_rpc_item_to_fop(reply_item);
6082 
6083  rw_reply = io_rw_rep_get(reply_fop);
6084  rc = rw_reply->rwr_rc;
6085  remid = &rw_reply->rwr_mod_rep.fmr_remid;
6086  req->ir_sns_state = rw_reply->rwr_repair_done;
6087  M0_LOG(M0_DEBUG, "[%p] item %p[%u], reply received = %d, "
6088  "sns state = %d", req, req_item,
6089  req_item->ri_type->rit_opcode, rc, req->ir_sns_state);
6090 
6091  irfop->irf_reply_rc = rc;
6092 
6093  /* update pending transaction number */
6096  csb = M0T1FS_SB(inode->ci_inode.i_sb);
6097  m0_mutex_lock(&csb->csb_confc_state.cus_lock);
6098  if (csb->csb_confc_state.cus_state != M0_CC_READY) {
6099  m0_mutex_unlock(&csb->csb_confc_state.cus_lock);
6100  rc = M0_ERR(-ESTALE);
6101  goto ref_dec;
6102  }
6103  m0_mutex_unlock(&csb->csb_confc_state.cus_lock);
6105  actual_bytes = rw_reply->rwr_count;
6106 
6107 ref_dec:
6108  /* For whatever reason, io didn't complete successfully.
6109  * Clear read bulk count */
6110  if (rc < 0 && m0_is_read_fop(&iofop->if_fop))
6112  m0_rpc_bulk_buf_length(&iofop->if_rbulk));
6113  if (tioreq->ti_rc == 0)
6114  tioreq->ti_rc = rc;
6115 
6116  /* For stale conf cache override the error. */
6117  if (rc == -ESTALE || (xfer->nxr_rc == 0 && rc != 0)) {
6118  xfer->nxr_rc = rc;
6119  M0_LOG(M0_ERROR, "[%p][type=%d] rc %d, tioreq->ti_rc %d, "
6120  "nwxfer rc = %d @"FID_F,
6121  req, req->ir_type, rc, tioreq->ti_rc,
6122  xfer->nxr_rc, FID_P(&tioreq->ti_fid));
6123  }
6124 
6125  if (irfop->irf_pattr == PA_DATA)
6126  tioreq->ti_databytes += iofop->if_rbulk.rb_bytes;
6127  else
6128  tioreq->ti_parbytes += iofop->if_rbulk.rb_bytes;
6129 
6130  M0_LOG(M0_INFO, "[%p] fop %p, Returned no of bytes = %llu, "
6131  "expected = %llu", req, &iofop->if_fop, actual_bytes,
6132  iofop->if_rbulk.rb_bytes);
6133  /* Drop reference on request and reply fop. */
6134  m0_fop_put0_lock(&iofop->if_fop);
6136  m0_atomic64_dec(&file_to_sb(req->ir_file)->csb_pending_io_nr);
6137 
6138  m0_mutex_lock(&xfer->nxr_lock);
6139  m0_atomic64_dec(&xfer->nxr_iofop_nr);
6140  if (should_req_sm_complete(req)) {
6144  }
6145  m0_mutex_unlock(&xfer->nxr_lock);
6146 
6147  M0_LOG(M0_DEBUG, "[%p] item %p, ref %llu, "FID_F", Pending fops %llu, "
6148  "Pending rdbulk %llu", req, req_item,
6149  (unsigned long long)m0_ref_read(&iofop->if_fop.f_ref),
6150  FID_P(&tioreq->ti_fid), m0_atomic64_get(&xfer->nxr_iofop_nr),
6151  m0_atomic64_get(&xfer->nxr_rdbulk_nr));
6152  M0_LEAVE();
6153 }
6154 
6155 static int nw_xfer_req_dispatch(struct nw_xfer_request *xfer)
6156 {
6157  int rc = 0;
6158  struct io_req_fop *irfop;
6159  struct io_request *req;
6160  struct target_ioreq *ti;
6161  struct m0t1fs_sb *csb;
6162  uint64_t nr_dispatched = 0;
6163  int post_error = 0;
6164  int ri_error;
6165 
6166  M0_ENTRY();
6167 
6168  M0_PRE(xfer != NULL);
6169  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
6170 
6171  M0_LOG(M0_DEBUG, "[%p]", req);
6173  csb = req->ir_file->f_path.mnt->mnt_sb->s_fs_info;
6174  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
6175  if (ti->ti_state != M0_PNDS_ONLINE) {
6176  M0_LOG(M0_INFO, "[%p] Skipped iofops prepare for "FID_F,
6177  req, FID_P(&ti->ti_fid));
6178  continue;
6179  }
6180  if (target_ioreq_type_get(ti) == TI_COB_CREATE &&
6182  rc = ti->ti_ops->tio_cc_fops_prepare(ti);
6183  if (rc != 0)
6184  return M0_ERR_INFO(rc, "[%p] cob create fop"
6185  "failed", req);
6186  continue;
6187  }
6188  rc = ti->ti_ops->tio_iofops_prepare(ti, PA_DATA);
6189  if (rc != 0)
6190  return M0_ERR_INFO(rc, "[%p] data fop failed", req);
6191 
6192  rc = ti->ti_ops->tio_iofops_prepare(ti, PA_PARITY);
6193  if (rc != 0)
6194  return M0_ERR_INFO(rc, "[%p] parity fop failed", req);
6195  } m0_htable_endfor;
6196 
6197  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
6198  /* Skips the target device if it is not online. */
6199  if (ti->ti_state != M0_PNDS_ONLINE) {
6200  M0_LOG(M0_INFO, "[%p] Skipped device "FID_F,
6201  req, FID_P(&ti->ti_fid));
6202  continue;
6203  }
6204  M0_LOG(M0_DEBUG, "[%p] Before Submitting fops for device "FID_F
6205  ", fops length of ti %u, total fops nr %llu", req,
6206  FID_P(&ti->ti_fid),
6207  (int)iofops_tlist_length(&ti->ti_iofops),
6208  m0_atomic64_get(&xfer->nxr_iofop_nr));
6209 
6210  if (target_ioreq_type_get(ti) == TI_COB_CREATE &&
6212  /*
6213  * An error returned by rpc post has been ignored.
6214  * It will be handled in the respective bottom half.
6215  */
6217  continue;
6218  }
6219  m0_tl_for (iofops, &ti->ti_iofops, irfop) {
6220  rc = iofop_async_submit(&irfop->irf_iofop,
6221  ti->ti_session);
6222  ri_error = irfop->irf_iofop.if_fop.f_item.ri_error;
6223  M0_LOG(M0_DEBUG, "[%p] Submitted fops for device "
6224  FID_F"@%p, item %p, fops nr=%llu, rc=%d, "
6225  "ri_error=%d", req, FID_P(&ti->ti_fid), irfop,
6226  &irfop->irf_iofop.if_fop.f_item,
6227  m0_atomic64_get(&xfer->nxr_iofop_nr), rc,
6228  ri_error);
6229  if (rc != 0)
6230  goto out;
6231 
6233  csb_pending_io_nr);
6234  if (ri_error == 0)
6235  M0_CNT_INC(nr_dispatched);
6236  else if (post_error == 0)
6237  post_error = ri_error;
6238  } m0_tl_endfor;
6239 
6240  } m0_htable_endfor;
6241 
6242 out:
6243  if (rc == 0 && nr_dispatched == 0 && post_error == 0) {
6244  /* No fop has been dispatched.
6245  *
6246  * This might happen in dgmode reading:
6247  * In 'parity verify' mode, a whole parity group, including
6248  * data and parity units are all read from ioservices.
6249  * If some units failed to read, no need to read extra unit.
6250  * The units needed for recovery are ready.
6251  */
6253  M0_ASSERT(req->ir_type == IRT_READ && csb->csb_verify);
6255  } else if (rc == 0)
6256  xfer->nxr_state = NXS_INFLIGHT;
6257  M0_LOG(M0_DEBUG, "[%p] nxr_iofop_nr %llu, nxr_rdbulk_nr %llu, "
6258  "nr_dispatched %llu", req,
6259  (unsigned long long)m0_atomic64_get(&xfer->nxr_iofop_nr),
6260  (unsigned long long)m0_atomic64_get(&xfer->nxr_rdbulk_nr),
6261  (unsigned long long)nr_dispatched);
6262 
6263  return M0_RC(rc);
6264 }
6265 
6266 static void nw_xfer_req_complete(struct nw_xfer_request *xfer, bool rmw)
6267 {
6268  struct io_request *req;
6269  struct target_ioreq *ti;
6270  struct io_req_fop *irfop;
6271  struct m0_fop *fop;
6272  struct m0_rpc_item *item;
6273  struct m0t1fs_inode *inode;
6274  struct m0t1fs_sb *csb;
6275 
6276  M0_ENTRY("nw_xfer_request %p, rmw %s", xfer,
6277  rmw ? (char *)"true" : (char *)"false");
6278  M0_PRE(xfer != NULL);
6279 
6280  xfer->nxr_state = NXS_COMPLETE;
6281  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
6283  csb = M0T1FS_SB(inode->ci_inode.i_sb);
6284 
6285  M0_LOG(M0_DEBUG, "[%p] nxr_iofop_nr %llu, nxr_rdbulk_nr %llu, "
6286  "rmw %s", req,
6287  (unsigned long long)m0_atomic64_get(&xfer->nxr_iofop_nr),
6288  (unsigned long long)m0_atomic64_get(&xfer->nxr_rdbulk_nr),
6289  rmw ? (char *)"true" : (char *)"false");
6290 
6291  m0_htable_for(tioreqht, ti, &xfer->nxr_tioreqs_hash) {
6292  /* Maintains only the first error encountered. */
6293  if (xfer->nxr_rc == 0) {
6294  xfer->nxr_rc = ti->ti_rc;
6295  M0_LOG(M0_DEBUG, "[%p] nwxfer rc = %d",
6296  req, xfer->nxr_rc);
6297  }
6298 
6299  xfer->nxr_bytes += ti->ti_databytes;
6300  ti->ti_databytes = 0;
6301 
6302  if (csb->csb_oostore && ti->ti_req_type == TI_COB_CREATE &&
6305  continue;
6306  }
6307  m0_tl_teardown(iofops, &ti->ti_iofops, irfop) {
6308  fop = &irfop->irf_iofop.if_fop;
6310  M0_LOG(M0_DEBUG, "[%p] fop %p, ref %llu, "
6311  "item %p[%u], ri_error %d, ri_state %d",
6312  req, fop,
6313  (unsigned long long)m0_ref_read(&fop->f_ref),
6315  item->ri_sm.sm_state);
6316 
6317  /* Maintains only the first error encountered. */
6318  if (xfer->nxr_rc == 0 &&
6320  xfer->nxr_rc = item->ri_error;
6321  M0_LOG(M0_DEBUG, "[%p] nwxfer rc = %d",
6322  req, xfer->nxr_rc);
6323  }
6324 
6327  item->ri_rmachine != NULL));
6328  if (item->ri_rmachine == NULL) {
6329  M0_ASSERT(ti->ti_session != NULL);
6332  }
6333 
6334  M0_LOG(M0_DEBUG, "[%p] item %p, target fid "
6335  FID_F"fop %p, "
6336  "ref %llu", req, item, FID_P(&ti->ti_fid), fop,
6337  (unsigned long long)m0_ref_read(&fop->f_ref));
6339  }
6340  } m0_htable_endfor;
6341 
6342  M0_LOG(M0_INFO, "[%p] Number of bytes %s = %llu",
6343  req, req->ir_type == IRT_READ? "read" : "written",
6344  xfer->nxr_bytes);
6345 
6346  M0_LOG(M0_DEBUG, "[%p] nxr_rc %d, nxr_iofop_nr %llu, "
6347  "nxr_rdbulk_nr %llu", req, xfer->nxr_rc,
6348  (unsigned long long)m0_atomic64_get(&xfer->nxr_iofop_nr),
6349  (unsigned long long)m0_atomic64_get(&xfer->nxr_rdbulk_nr));
6350  M0_ASSERT(ergo(xfer->nxr_rc == 0, nw_xfer_request_invariant(xfer)));
6351 
6352  /*
6353  * This function is invoked from 4 states - IRS_READ_COMPLETE,
6354  * IRS_WRITE_COMPLETE, IRS_DEGRADED_READING, IRS_DEGRADED_WRITING.
6355  * And the state change is applicable only for healthy state IO,
6356  * meaning for states IRS_READ_COMPLETE and IRS_WRITE_COMPLETE.
6357  */
6358  if (M0_IN(ioreq_sm_state(req),
6360  if (!rmw)
6362  else if (ioreq_sm_state(req) == IRS_READ_COMPLETE)
6363  xfer->nxr_bytes = 0;
6364  }
6365  req->ir_rc = xfer->nxr_rc;
6366  M0_LEAVE();
6367 }
6368 
6374 static int io_req_fop_dgmode_read(struct io_req_fop *irfop)
6375 {
6376  int rc;
6377  uint32_t cnt;
6378  uint32_t seg;
6379  uint32_t seg_nr;
6380  uint64_t grpid;
6381  uint64_t pgcur = 0;
6382  m0_bindex_t *index;
6383  struct io_request *req;
6384  struct m0_fop *fop;
6385  struct m0_rpc_bulk *rbulk;
6386  struct pargrp_iomap *map = NULL;
6387  struct m0_rpc_bulk_buf *rbuf;
6388 
6389  M0_PRE(irfop != NULL);
6390 
6391  req = bob_of(irfop->irf_tioreq->ti_nwxfer, struct io_request,
6392  ir_nwxfer, &ioreq_bobtype);
6393  rbulk = &irfop->irf_iofop.if_rbulk;
6394  fop = &irfop->irf_iofop.if_fop;
6395 
6396  M0_ENTRY("[%p] target fid "FID_F", fop %p, %p[%u] ", req,
6397  FID_P(&irfop->irf_tioreq->ti_fid), fop,
6398  &fop->f_item, m0_fop_opcode(fop));
6399 
6400  m0_tl_for (rpcbulk, &rbulk->rb_buflist, rbuf) {
6401 
6402  index = rbuf->bb_zerovec.z_index;
6404 
6405  for (seg = 0; seg < seg_nr; ) {
6406 
6407  grpid = pargrp_id_find(index[seg], req, irfop);
6408  for (cnt = 1, ++seg; seg < seg_nr; ++seg) {
6409 
6410  M0_ASSERT(ergo(seg > 0, index[seg] >
6411  index[seg - 1]));
6412  M0_ASSERT((index[seg] & ~PAGE_MASK) == 0);
6413 
6414  if (grpid ==
6415  pargrp_id_find(index[seg], req, irfop))
6416  ++cnt;
6417  else
6418  break;
6419  }
6420  ioreq_pgiomap_find(req, grpid, &pgcur, &map);
6421  M0_ASSERT(map != NULL);
6422  rc = map->pi_ops->pi_dgmode_process(map,
6423  irfop->irf_tioreq, &index[seg - cnt],
6424  cnt);
6425  if (rc != 0)
6426  return M0_ERR_INFO(rc, "[%p] fop %p, %p[%u] "
6427  "Parity group dgmode process failed",
6428  req, fop, &fop->f_item,
6429  m0_fop_opcode(fop));
6430  }
6431  } m0_tl_endfor;
6432  return M0_RC(0);
6433 }
6434 
6435 /*
6436  * Used in precomputing io fop size while adding rpc bulk buffer and
6437  * data buffers.
6438  */
6439 static inline uint32_t io_desc_size(struct m0_net_domain *ndom)
6440 {
6441  return
6442  /* size of variables ci_nr and nbd_len */
6445  /* size of nbd_data */
6447 }
6448 
6449 static inline uint32_t io_seg_size(void)
6450 {
6451  return sizeof(struct m0_ioseg);
6452 }
6453 
6454 static uint32_t io_di_size(const struct io_request *req)
6455 {
6456  struct m0_file *file;
6457 
6458  file = &m0t1fs_file_to_m0inode(req->ir_file)->ci_flock;
6459  if (file->fi_di_ops->do_out_shift(file) == 0)
6460  return 0;
6462 }
6463 
6464 static int bulk_buffer_add(struct io_req_fop *irfop,
6465  struct m0_net_domain *dom,
6466  struct m0_rpc_bulk_buf **rbuf,
6467  uint32_t *delta,
6468  uint32_t maxsize)
6469 {
6470  int rc;
6471  int seg_nr;
6472  struct io_request *req;
6473  struct m0_indexvec_varr *ivv;
6474 
6475  M0_PRE(irfop != NULL);
6476  M0_PRE(dom != NULL);
6477  M0_PRE(rbuf != NULL);
6478  M0_PRE(delta != NULL);
6479  M0_PRE(maxsize > 0);
6480  M0_ENTRY("io_req_fop %p net_domain %p delta_size %d",
6481  irfop, dom, *delta);
6482 
6483  req = bob_of(irfop->irf_tioreq->ti_nwxfer, struct io_request,
6484  ir_nwxfer, &ioreq_bobtype);
6485 
6486  if (M0_IN(ioreq_sm_state(req), (IRS_READING, IRS_WRITING))) {
6487  ivv = &irfop->irf_tioreq->ti_ivv;
6488  } else {
6489  ivv = &irfop->irf_tioreq->ti_dgvec->dr_ivec_varr;
6490  }
6491 
6493  V_SEG_NR(ivv));
6494  *delta += io_desc_size(dom);
6495 
6496  if (m0_io_fop_size_get(&irfop->irf_iofop.if_fop) + *delta < maxsize) {
6497 
6499  dom, NULL, rbuf);
6500  if (rc != 0) {
6501  *delta -= io_desc_size(dom);
6502  return M0_ERR_INFO(rc, "[%p] Failed to add "
6503  "rpc_bulk_buffer", req);
6504  }
6505  } else {
6506  rc = M0_ERR(-ENOSPC);
6507  *delta -= io_desc_size(dom);
6508  }
6509 
6510  M0_POST(ergo(rc == 0, *rbuf != NULL));
6511  return M0_RC(rc);
6512 }
6513 
6514 static void cc_fop_release(struct m0_ref *ref)
6515 {
6516  struct m0_fop *fop;
6517 
6518  M0_ENTRY();
6519  fop = container_of(ref, struct m0_fop, f_ref);
6520  m0_fop_fini(fop);
6521  M0_LEAVE();
6522 }
6523 
6525 {
6526  struct m0_fop *fop;
6527  struct m0_fop_cob_common *common;
6528  struct io_request *req;
6529  int rc;
6530 
6532  fop = &ti->ti_cc_fop.crf_fop;
6535  if (rc != 0) {
6536  m0_fop_fini(fop);
6537  goto out;
6538  }
6539  ti->ti_cc_fop_inited = true;
6543  fop->f_item.ri_nr_sent_max = M0T1FS_RPC_MAX_RETRIES;
6544  fop->f_item.ri_resend_interval = M0T1FS_RPC_RESEND_INTERVAL;
6545  req = bob_of(ti->ti_nwxfer, struct io_request, ir_nwxfer,
6546  &ioreq_bobtype);
6547  common = m0_cobfop_common_get(fop);
6548  common->c_gobfid = *file_to_fid(req->ir_file);
6549  common->c_cobfid = ti->ti_fid;
6550  common->c_pver = m0t1fs_file_to_m0inode(req->ir_file)->ci_pver;
6551  common->c_cob_type = M0_COB_IO;
6552  common->c_cob_idx = m0_fid_cob_device_id(&ti->ti_fid);
6553  common->c_flags |= M0_IO_FLAG_CROW;
6554  common->c_body.b_pver = m0t1fs_file_to_m0inode(req->ir_file)->ci_pver;
6555  common->c_body.b_nlink = 1;
6556  common->c_body.b_valid |= M0_COB_PVER;
6557  common->c_body.b_valid |= M0_COB_NLINK;
6558  common->c_body.b_valid |= M0_COB_LID;
6559  common->c_body.b_lid = m0t1fs_file_to_m0inode(req->ir_file)->ci_layout_id;
6561 
6562 out:
6563  return M0_RC(rc);
6564 }
6565 
6567  enum page_attr filter)
6568 {
6569  int rc = 0;
6570  uint32_t seg = 0;
6571  /* Number of segments in one m0_rpc_bulk_buf structure. */
6572  uint32_t bbsegs;
6573  uint32_t maxsize;
6574  uint32_t delta;
6575  enum page_attr rw;
6576  struct m0_varr *pattr;
6577  struct m0_indexvec_varr *bvec;
6578  struct io_request *req;
6579  struct m0_indexvec_varr *ivv = NULL;
6580  struct io_req_fop *irfop;
6581  struct m0_net_domain *ndom;
6582  struct m0_rpc_bulk_buf *rbuf;
6583  struct m0_io_fop *iofop;
6584  struct m0_fop_cob_rw *rw_fop;
6585  struct nw_xfer_request *xfer;
6586 
6588  M0_PRE(M0_IN(filter, (PA_DATA, PA_PARITY)));
6589 
6590  xfer = ti->ti_nwxfer;
6591  req = bob_of(xfer, struct io_request, ir_nwxfer, &ioreq_bobtype);
6592 
6593  M0_ASSERT(M0_IN(ioreq_sm_state(req),
6596 
6597  M0_ENTRY("[%p] prepare io fops for target ioreq %p filter 0x%x, tfid "
6598  FID_F, req, ti, filter, FID_P(&ti->ti_fid));
6599 
6601  if (rc != 0 && rc != -ECANCELED)
6602  return M0_ERR(rc);
6603 
6604  if (M0_IN(ioreq_sm_state(req), (IRS_READING, IRS_WRITING))) {
6605  ivv = &ti->ti_ivv;
6606  bvec = &ti->ti_bufvec;
6607  pattr = &ti->ti_pageattrs;
6608  } else {
6609  if (ti->ti_dgvec == NULL) {
6610  return M0_RC(0);
6611  }
6612  ivv = &ti->ti_dgvec->dr_ivec_varr;
6613  bvec = &ti->ti_dgvec->dr_bufvec;
6614  pattr = &ti->ti_dgvec->dr_pageattrs;
6615  }
6616 
6621  PA_READ;
6623 
6624  while (seg < V_SEG_NR(ivv)) {
6625 
6626  delta = 0;
6627  bbsegs = 0;
6628 
6629  M0_LOG(M0_DEBUG, "[%p] seg=%u@%u pageattr=0x%x, filter=0x%x, "
6630  "rw=0x%x",
6631  req, seg, V_SEG_NR(ivv),
6632  PA(pattr, seg), filter, rw);
6633 
6634  if (!(PA(pattr, seg) & filter) || !(PA(pattr, seg) & rw)) {
6635  M0_LOG(M0_DEBUG, "[%p] skipping, pageattr = 0x%x, "
6636  "filter = 0x%x, rw = 0x%x",
6637  req, PA(pattr, seg), filter, rw);
6638  ++seg;
6639  continue;
6640  }
6641  M0_ALLOC_PTR(irfop);
6642  if (irfop == NULL) {
6643  rc = M0_ERR(-ENOMEM);
6644  goto err;
6645  }
6646  rc = io_req_fop_init(irfop, ti, filter);
6647  if (rc != 0) {
6648  m0_free(irfop);
6649  goto err;
6650  }
6651  ++iommstats.a_io_req_fop_nr;
6652 
6653  iofop = &irfop->irf_iofop;
6654  rw_fop = io_rw_get(&iofop->if_fop);
6655 
6656  rc = bulk_buffer_add(irfop, ndom, &rbuf, &delta, maxsize);
6657  if (rc != 0) {
6658  io_req_fop_fini(irfop);
6659  m0_free(irfop);
6660  goto err;
6661  }
6662  delta += io_seg_size();
6663 
6664  /*
6665  * Adds io segments and io descriptor only if it fits within
6666  * permitted size.
6667  */
6668  while (seg < V_SEG_NR(ivv) &&
6669  m0_io_fop_size_get(&iofop->if_fop) + delta < maxsize) {
6670 
6671  M0_LOG(M0_DEBUG, "[%p] adding: seg=%u@%u pa=0x%x, "
6672  "filter=0x%x, rw=0x%x", req, seg,
6673  V_SEG_NR(ivv),
6674  PA(pattr, seg), filter, rw);
6675 
6676  /*
6677  * Adds a page to rpc bulk buffer only if it passes
6678  * through the filter.
6679  */
6680  if ((PA(pattr, seg) & rw) && (PA(pattr, seg) & filter)) {
6681  delta += io_seg_size() + io_di_size(req);
6682 
6684  V_ADDR (bvec, seg),
6685  V_COUNT(ivv, seg),
6686  V_INDEX(ivv, seg),
6687  ndom);
6688 
6689  if (rc == -EMSGSIZE) {
6690 
6691  /*
6692  * Fix the number of segments in
6693  * current m0_rpc_bulk_buf structure.
6694  */
6695  rbuf->bb_nbuf->nb_buffer.ov_vec.v_nr =
6696  bbsegs;
6697  rbuf->bb_zerovec.z_bvec.ov_vec.v_nr =
6698  bbsegs;
6699  bbsegs = 0;
6700 
6701  delta -= io_seg_size() -
6702  io_di_size(req);
6703  rc = bulk_buffer_add(irfop, ndom,
6704  &rbuf, &delta, maxsize);
6705  if (rc == -ENOSPC)
6706  break;
6707  else if (rc != 0)
6708  goto fini_fop;
6709 
6710  /*
6711  * Since current bulk buffer is full,
6712  * new bulk buffer is added and
6713  * existing segment is attempted to
6714  * be added to new bulk buffer.
6715  */
6716  continue;
6717  } else if (rc == 0)
6718  ++bbsegs;
6719  }
6720  ++seg;
6721  }
6722 
6723  if (m0_io_fop_byte_count(iofop) == 0) {
6724  irfop_fini(irfop);
6725  continue;
6726  }
6727 
6728  rbuf->bb_nbuf->nb_buffer.ov_vec.v_nr = bbsegs;
6729  rbuf->bb_zerovec.z_bvec.ov_vec.v_nr = bbsegs;
6730 
6731  rw_fop->crw_fid = ti->ti_fid;
6732  rw_fop->crw_index = ti->ti_obj;
6733  rw_fop->crw_pver =
6734  m0t1fs_file_to_m0inode(req->ir_file)->ci_pver;
6735  rw_fop->crw_lid = m0t1fs_file_to_m0inode(req->ir_file)->ci_layout_id;
6736 
6737  rc = m0_io_fop_prepare(&iofop->if_fop);
6738  if (rc != 0)
6739  goto fini_fop;
6740 
6741  if (m0_is_read_fop(&iofop->if_fop))
6744  &iofop->if_rbulk));
6745 
6746  m0_atomic64_inc(&xfer->nxr_iofop_nr);
6747  iofops_tlist_add(&ti->ti_iofops, irfop);
6748 
6749  M0_LOG(M0_DEBUG, "[%p] fop=%p bulk=%p (%s) @"FID_F
6750  " pending io fops = %llu, pending read bulks = %llu "
6751  "list_len=%d",
6752  req, &iofop->if_fop, &iofop->if_rbulk,
6753  m0_is_read_fop(&iofop->if_fop) ? "r" : "w",
6754  FID_P(&ti->ti_fid),
6755  m0_atomic64_get(&xfer->nxr_iofop_nr),
6757  (int)iofops_tlist_length(&ti->ti_iofops));
6758  }
6759 
6760  return M0_RC(0);
6761 fini_fop:
6762  irfop_fini(irfop);
6763 err:
6764  m0_tl_teardown(iofops, &ti->ti_iofops, irfop) {
6765  irfop_fini(irfop);
6766  }
6767 
6768  return M0_ERR_INFO(rc, "[%p] iofops_prepare failed", req);
6769 }
6770 
6771 const struct inode_operations m0t1fs_reg_inode_operations = {
6772  .setattr = m0t1fs_setattr,
6773  .getattr = m0t1fs_getattr,
6774 #if LINUX_VERSION_CODE < KERNEL_VERSION(4,9,0)
6775  .setxattr = m0t1fs_setxattr,
6776  .getxattr = m0t1fs_getxattr,
6777  .removexattr = m0t1fs_removexattr,
6778 #endif
6779  .listxattr = m0t1fs_listxattr,
6780 };
6781 
6782 
6783 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0)
6784 static ssize_t m0t1fs_direct_IO(struct kiocb *kcb,
6785  struct iov_iter *from)
6786 {
6787  struct m0_indexvec_varr *ivv;
6788  ssize_t retval;
6789  loff_t size;
6790  int seg;
6791  int rw;
6792 
6794  M0_ENTRY();
6795  rw = iov_iter_rw(from);
6796  M0_LOG(M0_DEBUG, "m0t1fs_direct_IO: rw=%s pos=%lld seg_nr=%lu "
6797  "addr=%p len=%lu", rw == READ ? "READ" : "WRITE",
6798  (long long)kcb->ki_pos, from->nr_segs, from->iov->iov_base,
6799  from->iov->iov_len);
6800 
6801  M0_PRE(M0_IN(rw, (READ, WRITE)));
6802 
6803  size = i_size_read(m0t1fs_file_to_inode(kcb->ki_filp));
6804  ivv = indexvec_create(from->nr_segs, from->iov, kcb->ki_pos);
6805  if (ivv == NULL)
6806  return M0_ERR(-ENOMEM);
6807  if (rw == READ) {
6808  /* Truncate vector to eliminate reading beyond the EOF */
6809  for (seg = 0; seg < V_SEG_NR(ivv); ++seg)
6810  if (v_seg_endpos(ivv, seg) > size) {
6811  V_SEG_NR(ivv) = seg + 1;
6812  V_COUNT(ivv, seg) = size - V_INDEX(ivv, seg);
6813  break;
6814  }
6815  }
6816 
6817  retval = m0t1fs_aio(kcb, from->iov, ivv, rw == READ ? IRT_READ : IRT_WRITE);
6818 
6819  /*
6820  * m0t1fs_direct_IO() must process all requested data or return error.
6821  * Otherwise generic kernel code will use unimplemented callbacks to
6822  * continue buffered I/O (e.g. write_begin()).
6823  */
6824  M0_ASSERT_INFO(retval < 0 || retval == indexvec_varr_count(ivv),
6825  "%" PRIi64 " != %" PRIi64, (int64_t)retval,
6826  indexvec_varr_count(ivv));
6827 
6828  m0_indexvec_varr_free(ivv);
6829  m0_free(ivv);
6830  M0_LEAVE();
6831  return retval;
6832 }
6833 
6834 
6835 #else
6836 static ssize_t m0t1fs_direct_IO(int rw,
6837  struct kiocb *kcb,
6838  const struct iovec *iov,
6839  loff_t pos,
6840  unsigned long seg_nr)
6841 {
6842  struct m0_indexvec_varr *ivv;
6843  ssize_t retval;
6844  loff_t size;
6845  int seg;
6846 
6848  M0_ENTRY();
6849  M0_LOG(M0_DEBUG, "m0t1fs_direct_IO: rw=%s pos=%lld seg_nr=%lu "
6850  "addr=%p len=%lu", rw == READ ? "READ" : "WRITE",
6851  (long long)pos, seg_nr, iov->iov_base, iov->iov_len);
6852 
6853  M0_PRE(M0_IN(rw, (READ, WRITE)));
6854 
6855  size = i_size_read(m0t1fs_file_to_inode(kcb->ki_filp));
6856  ivv = indexvec_create(seg_nr, iov, pos);
6857  if (ivv == NULL)
6858  return M0_ERR(-ENOMEM);
6859  if (rw == READ) {
6860  /* Truncate vector to eliminate reading beyond the EOF */
6861  for (seg = 0; seg < V_SEG_NR(ivv); ++seg)
6862  if (v_seg_endpos(ivv, seg) > size) {
6863  V_SEG_NR(ivv) = seg + 1;
6864  V_COUNT(ivv, seg) = size - V_INDEX(ivv, seg);
6865  break;
6866  }
6867  }
6868 
6869  retval = m0t1fs_aio(kcb, iov, ivv, rw == READ ? IRT_READ : IRT_WRITE);
6870 
6871  /*
6872  * m0t1fs_direct_IO() must process all requested data or return error.
6873  * Otherwise generic kernel code will use unimplemented callbacks to
6874  * continue buffered I/O (e.g. write_begin()).
6875  */
6876  M0_ASSERT_INFO(retval < 0 || retval == indexvec_varr_count(ivv),
6877  "%" PRIi64 " != %" PRIi64, (int64_t)retval,
6878  indexvec_varr_count(ivv));
6879 
6880  m0_indexvec_varr_free(ivv);
6881  m0_free(ivv);
6882  M0_LEAVE();
6883  return retval;
6884 }
6885 #endif
6886 
6887 const struct address_space_operations m0t1fs_aops = {
6888  .direct_IO = m0t1fs_direct_IO,
6889 };
6890 
6891 #undef M0_TRACE_SUBSYSTEM
M0_INTERNAL long m0t1fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
Definition: ioctl.c:36
ssize_t m0t1fs_listxattr(struct dentry *dentry, char *buffer, size_t size)
Definition: dir.c:538
static void m0_atomic64_inc(struct m0_atomic64 *a)
uint32_t b_nlink
Definition: md_fops.h:81
M0_INTERNAL int m0_rpc_post(struct m0_rpc_item *item)
Definition: rpc.c:63
static int io_request_init(struct io_request *req, struct file *file, const struct iovec *iov, struct m0_indexvec_varr *ivv, enum io_req_type rw)
Definition: file.c:4299
static ssize_t aio_read(struct kiocb *kcb, struct iov_iter *from)
Definition: file.c:5375
static int ioreq_file_lock(struct io_request *req)
Definition: file.c:3970
static int user_data_copy(struct pargrp_iomap *map, m0_bindex_t start, m0_bindex_t end, struct iov_iter *it, enum copy_direction dir, enum page_attr filter)
Definition: file.c:1294
uint32_t rit_opcode
Definition: item.h:474
uint32_t m0_fop_opcode(const struct m0_fop *fop)
Definition: fop.c:226
static uint32_t iomap_dgmode_recov_prepare(struct pargrp_iomap *map, uint8_t *failed)
Definition: file.c:3012
uint64_t crw_lid
Definition: io_fops.h:394
static size_t nr
Definition: dump.c:1505
M0_INTERNAL void m0_chan_wait(struct m0_clink *link)
Definition: chan.c:336
uint64_t c_flags
Definition: io_fops.h:477
enum sns_repair_state ir_sns_state
M0_INTERNAL bool m0_ivec_varr_cursor_move_to(struct m0_ivec_varr_cursor *cur, m0_bindex_t dest)
Definition: vec.c:1250
uint64_t ir_iomap_nr
m0_time_t ri_resend_interval
Definition: item.h:144
uint64_t rwr_count
Definition: io_fops.h:324
static int ioreq_dgmode_recover(struct io_request *req)
Definition: file.c:3570
#define M0_PRE(cond)
#define M0_ALLOC_ARR(arr, nr)
Definition: memory.h:84
M0_TL_DECLARE(rpcbulk, M0_INTERNAL, struct m0_rpc_bulk_buf)
#define V_INDEX(ivec, i)
Definition: file.c:395
M0_INTERNAL void m0_sm_fail(struct m0_sm *mach, int fail_state, int32_t rc)
Definition: sm.c:468
static const struct io_request_ops ioreq_oostore_ops
Definition: file.c:961
static struct m0_fid target_fid(const struct io_request *req, struct m0_pdclust_tgt_addr *tgt)
Definition: file.c:668
M0_INTERNAL m0_bcount_t m0_ext_length(const struct m0_ext *ext)
Definition: ext.c:42
M0_INTERNAL m0_bcount_t m0_io_fop_byte_count(struct m0_io_fop *iofop)
Definition: io_fops.c:1925
static m0_bindex_t seg_set(struct pargrp_iomap *map, uint32_t seg, struct m0_ivec_varr_cursor *cur, m0_bindex_t grpend)
Definition: file.c:2329
#define COUNT(ivec, i)
Definition: file.c:392
M0_INTERNAL void m0_mutex_unlock(struct m0_mutex *mutex)
Definition: mutex.c:66
static bool target_ioreq_invariant(struct target_ioreq *ti)
Definition: file.c:1174
static const struct io_request_ops ioreq_ops
Definition: file.c:947
M0_INTERNAL struct m0t1fs_sb * m0_fop_to_sb(struct m0_fop *fop)
Definition: file.c:6012
M0_INTERNAL int m0t1fs_inode_layout_init(struct m0t1fs_inode *ci)
Definition: inode.c:570
int m0t1fs_removexattr(struct dentry *dentry, const char *name)
Definition: dir.c:550
M0_INTERNAL struct m0_fop_cob_common * m0_cobfop_common_get(struct m0_fop *fop)
Definition: io_fops.c:990
#define m0_htable_for(name, var, htable)
Definition: hash.h:483
struct m0_fop crf_fop
static void parity_page_pos_get(struct pargrp_iomap *map, m0_bindex_t index, uint32_t *row, uint32_t *col)
Definition: file.c:744
m0_bindex_t e_end
Definition: ext.h:40
static uint32_t seg_nr
Definition: net.c:119
int const char const void size_t int flags
Definition: dir.c:328
struct m0_layout * li_l
Definition: layout.h:590
uint64_t sa_group
Definition: pdclust.h:241
static m0_bcount_t seg_endpos(const struct m0_indexvec *ivec, uint32_t i)
Definition: file.c:420
static const uint64_t k1
Definition: hash_fnc.c:34
uint32_t b_valid
Definition: md_fops.h:76
#define NULL
Definition: misc.h:38
M0_INTERNAL void m0_clink_init(struct m0_clink *link, m0_chan_cb_t cb)
Definition: chan.c:201
map
Definition: processor.c:112
int(* iro_parity_recalc)(struct io_request *req)
static int dgmode_rwvec_alloc_init(struct target_ioreq *ti)
Definition: file.c:3275
uint64_t pa_unit_size
Definition: pdclust.h:118
struct m0_atomic64 nxr_rdbulk_nr
struct m0_indexvec_varr pi_ivv
M0_INTERNAL int m0_rpc_bulk_store(struct m0_rpc_bulk *rbulk, const struct m0_rpc_conn *conn, struct m0_net_buf_desc_data *to_desc, const struct m0_net_buffer_callbacks *bulk_cb)
Definition: bulk.c:520
M0_INTERNAL void m0_clink_del_lock(struct m0_clink *link)
Definition: chan.c:293
struct m0_pool_version * l_pver
Definition: layout.h:261
static void io_req_fop_fini(struct io_req_fop *fop)
Definition: file.c:5019
static void pargrp_iomap_fini(struct pargrp_iomap *map)
Definition: file.c:1881
static struct buffer * cur(struct m0_addb2_mach *mach, m0_bcount_t space)
Definition: addb2.c:791
uint32_t crw_index
Definition: io_fops.h:388
struct m0_bufvec nb_buffer
Definition: net.h:1322
int(* iro_dgmode_write)(struct io_request *req, bool rmw)
Definition: idx_mock.c:52
m0_bindex_t * z_index
Definition: vec.h:516
struct m0_buf db_auxbuf
#define ergo(a, b)
Definition: misc.h:293
static int ioreq_iomaps_prepare(struct io_request *req)
Definition: file.c:3187
uint32_t rwr_repair_done
Definition: io_fops.h:333
Definition: storage.c:103
uint32_t ci_nr
Definition: vec.h:618
void(* sa_cb)(struct m0_sm_group *grp, struct m0_sm_ast *)
Definition: sm.h:506
static bool is_page_read(struct data_buf *dbuf)
Definition: file.c:565
static int target_cob_create_fop_prepare(struct target_ioreq *ti)
Definition: file.c:6524
#define M0_MEMBER_SIZE(type, member)
Definition: misc.h:62
Definition: sm.h:350
const m0_time_t M0_TIME_NEVER
Definition: time.c:108
void * b_addr
Definition: buf.h:39
M0_INTERNAL struct m0_pool_version * m0_pool_version_find(struct m0_pools_common *pc, const struct m0_fid *id)
Definition: pool.c:586
static struct io_request req
Definition: file.c:100
uint32_t ir_dgmap_nr
struct m0_file file
Definition: di.c:36
uint32_t pa_N
Definition: pdclust.h:104
static struct m0_sm_group * grp
Definition: bytecount.c:38
M0_INTERNAL void m0_fop_init(struct m0_fop *fop, struct m0_fop_type *fopt, void *data, void(*fop_release)(struct m0_ref *))
Definition: fop.c:79
M0_INTERNAL int m0_rpc_bulk_buf_databuf_add(struct m0_rpc_bulk_buf *rbuf, void *buf, m0_bcount_t count, m0_bindex_t index, struct m0_net_domain *netdom)
Definition: bulk.c:331
struct data_buf *** pi_paritybufs
struct m0_poolmach pv_mach
Definition: pool.h:133
static void data_buf_fini(struct data_buf *buf)
Definition: file.c:1157
static int nw_xfer_tioreq_map(struct nw_xfer_request *xfer, const struct m0_pdclust_src_addr *src, struct m0_pdclust_tgt_addr *tgt, struct target_ioreq **tio)
Definition: file.c:4520
#define M0_LOG(level,...)
Definition: trace.h:167
M0_LEAVE()
struct m0_sm_ast crf_ast
static void nw_xfer_req_complete(struct nw_xfer_request *xfer, bool rmw)
Definition: file.c:6266
M0_INTERNAL void m0_sm_ast_post(struct m0_sm_group *grp, struct m0_sm_ast *ast)
Definition: sm.c:135
static uint32_t layout_k(const struct m0_pdclust_layout *play)
Definition: file.c:520
static ssize_t file_aio_read(struct kiocb *kcb, struct iov_iter *from)
Definition: file.c:5544
static void m0_atomic64_sub(struct m0_atomic64 *a, int64_t num)
int(* nxo_tioreq_map)(struct nw_xfer_request *xfer, const struct m0_pdclust_src_addr *src, struct m0_pdclust_tgt_addr *tgt, struct target_ioreq **tio)
static struct m0t1fs_sb * file_to_sb(const struct file *file)
Definition: file.c:482
M0_INTERNAL const struct m0_fid * m0t1fs_inode_fid(const struct m0t1fs_inode *ci)
Definition: inode.c:61
struct m0_layout_instance pi_base
Definition: pdclust.h:173
uint32_t pa_K
Definition: pdclust.h:107
M0_INTERNAL int m0_sns_repair_spare_map(struct m0_poolmach *pm, const struct m0_fid *fid, struct m0_pdclust_layout *pl, struct m0_pdclust_instance *pi, uint64_t group, uint64_t unit, uint32_t *spare_slot_out, uint32_t *spare_slot_out_prev)
uint64_t(* do_out_shift)(const struct m0_file *file)
Definition: di.h:109
struct m0_vec ov_vec
Definition: vec.h:147
struct m0_chan rb_chan
Definition: bulk.h:258
static m0_bcount_t v_seg_endpos(struct m0_indexvec_varr *ivec, uint32_t i)
Definition: file.c:428
static ssize_t aio_write(struct kiocb *kcb, struct iov_iter *from)
Definition: file.c:5234
static const struct m0_bob_type ioreq_bobtype
Definition: file.c:340
static bool m0_is_po2(uint64_t val)
Definition: arith.h:153
struct m0_rpc_bulk if_rbulk
Definition: io_fops.h:177
M0_INTERNAL void m0_buf_init(struct m0_buf *buf, void *data, uint32_t nob)
Definition: buf.c:37
struct m0_sm ri_sm
Definition: item.h:181
void(* nxo_complete)(struct nw_xfer_request *xfer, bool rmw)
struct m0_bufvec data
Definition: di.c:40
const struct address_space_operations m0t1fs_aops
Definition: file.c:6887
static uint32_t io_seg_size(void)
Definition: file.c:6449
M0_INTERNAL m0_bindex_t m0_ivec_varr_cursor_conti(const struct m0_ivec_varr_cursor *cur, m0_bindex_t dest)
Definition: vec.c:1271
#define V_ADDR(bv, i)
Definition: file.c:396
uint64_t ta_obj
Definition: pdclust.h:256
M0_INTERNAL m0_bcount_t m0_ivec_varr_cursor_step(const struct m0_ivec_varr_cursor *cur)
Definition: vec.c:1224
static void buf_page_free(struct m0_buf *buf)
Definition: file.c:4828
int(* pi_dgmode_recover)(struct pargrp_iomap *map)
static void seg_idx_inc_round(struct pargrp_iomap *map, uint32_t seg, uint64_t sz)
Definition: file.c:2341
enum target_ioreq_type ti_req_type
struct m0_indexvec_varr ti_bufvec
#define PA(pa, i)
Definition: file.c:400
static int sum
Definition: rwlock.c:53
struct m0_net_domain * ntm_dom
Definition: net.h:853
int32_t ri_error
Definition: item.h:161
struct m0_net_buf_desc_data * id_descs
Definition: io_fops.h:313
void * m0_fop_data(const struct m0_fop *fop)
Definition: fop.c:220
uint32_t nbd_len
uint32_t c_cob_type
Definition: io_fops.h:474
static struct m0_be_emap_cursor it
Definition: extmap.c:46
#define m0_varr_endfor
Definition: varr.h:264
M0_HT_DESCR_DEFINE(tioreqht, "Hash of target_ioreq objects", static, struct target_ioreq, ti_link, ti_magic, M0_T1FS_TIOREQ_MAGIC, M0_T1FS_TLIST_HEAD_MAGIC, ti_fid.f_container, tioreqs_hash_func, tioreq_key_eq)
M0_INTERNAL void m0_file_lock(struct m0_rm_owner *owner, struct m0_rm_incoming *req)
Definition: file.c:522
static struct m0_bob_type iofop_bobtype
Definition: file.c:339
uint64_t m0_bindex_t
Definition: types.h:80
uint64_t ti_obj
struct m0_varr ti_pageattrs
#define M0_BITS(...)
Definition: misc.h:236
struct m0_fid c_cobfid
Definition: io_fops.h:465
uint64_t m0_bcount_t
Definition: types.h:77
Definition: sm.h:504
static void io_rpc_item_cb(struct m0_rpc_item *item)
Definition: file.c:5954
M0_INTERNAL int m0_poolmach_device_state(struct m0_poolmach *pm, uint32_t device_index, enum m0_pool_nd_state *state_out)
Definition: pool_machine.c:816
M0_INTERNAL int m0_parity_math_recover(struct m0_parity_math *math, struct m0_buf *data, struct m0_buf *parity, struct m0_buf *fails, enum m0_parity_linsys_algo algo)
Definition: parity_math.c:383
int m0t1fs_flush(struct file *file, fl_owner_t id)
Definition: file.c:5570
#define PAGE_SIZE
Definition: lnet_ut.c:277
static int void * buf
Definition: dir.c:1019
static uint64_t round_up(uint64_t val, uint64_t size)
Definition: file.c:711
#define container_of(ptr, type, member)
Definition: misc.h:33
struct m0_rm_credit rin_want
Definition: rm.h:1450
static struct m0_rpc_session session
Definition: formation2.c:38
#define M0_SET0(obj)
Definition: misc.h:64
M0_INTERNAL void m0_mutex_lock(struct m0_mutex *mutex)
Definition: mutex.c:49
static void ioreq_sm_state_set(struct io_request *req, int state)
Definition: file.c:1039
M0_ADDB2_ADD(M0_AVI_FS_CREATE, new_fid.f_container, new_fid.f_key, mode, rc)
static int pargrp_iomap_parity_verify(struct pargrp_iomap *map)
Definition: file.c:1409
m0_bcount_t nbe_length
Definition: net.h:1226
int(* iro_parity_verify)(struct io_request *req)
M0_INTERNAL int m0_parity_math_diff(struct m0_parity_math *math, struct m0_buf *old, struct m0_buf *new, struct m0_buf *parity, uint32_t index)
Definition: parity_math.c:371
static int io_req_fop_init(struct io_req_fop *fop, struct target_ioreq *ti, enum page_attr pattr)
Definition: file.c:4971
struct m0_net_buffer * nbe_buffer
Definition: net.h:1194
M0_INTERNAL int m0_fid_cmp(const struct m0_fid *fid0, const struct m0_fid *fid1)
Definition: fid.c:170
struct m0t1fs_sb * csb
Definition: dir.c:330
struct m0_sm_ast irf_ast
struct m0_fid crw_pver
Definition: io_fops.h:391
static int io_req_fop_dgmode_read(struct io_req_fop *irfop)
Definition: file.c:6374
uint64_t bt_magix
Definition: bob.h:77
#define M0_SWAP(v0, v1)
Definition: arith.h:207
static struct m0_rpc_item * item
Definition: item.c:56
struct m0_pdclust_attr pl_attr
Definition: pdclust.h:150
static void databufs_set_dgw_mode(struct pargrp_iomap *iomap, struct m0_ext *ext)
Definition: file.c:3351
struct target_ioreq * irf_tioreq
const char * bt_name
Definition: bob.h:73
static struct inode * iomap_to_inode(const struct pargrp_iomap *map)
Definition: file.c:467
Definition: sock.c:887
static m0_bcount_t count
Definition: xcode.c:167
M0_INTERNAL uint64_t m0_round_up(uint64_t val, uint64_t size)
Definition: misc.c:181
static ssize_t file_aio_write(struct kiocb *kcb, struct iov_iter *from)
Definition: file.c:5348
static int bulk_buffer_add(struct io_req_fop *irfop, struct m0_net_domain *dom, struct m0_rpc_bulk_buf **rbuf, uint32_t *delta, uint32_t maxsize)
Definition: file.c:6464
struct inode * inode
Definition: dir.c:624
M0_INTERNAL bool m0_tlist_is_empty(const struct m0_tl_descr *d, const struct m0_tl *list)
Definition: tlist.c:96
M0_INTERNAL void m0_rpc_bulk_buflist_empty(struct m0_rpc_bulk *rbulk)
Definition: bulk.c:279
static int ioreq_sm_timedwait(struct io_request *req, uint64_t state)
Definition: file.c:3546
struct target_ioreq * dr_tioreq
enum m0_pool_nd_state ti_state
#define m0_tl_endfor
Definition: tlist.h:700
M0_INTERNAL int m0_sm_timedwait(struct m0_sm *mach, uint64_t states, m0_time_t deadline)
Definition: sm.c:387
struct m0_fid fid
Definition: di.c:46
M0_INTERNAL uint64_t m0_round_down(uint64_t val, uint64_t size)
Definition: misc.c:187
static int pargrp_iomap_dgmode_recover(struct pargrp_iomap *map)
Definition: file.c:3040
return M0_RC(rc)
M0_INTERNAL void m0_parity_math_calculate(struct m0_parity_math *math, struct m0_buf *data, struct m0_buf *parity)
Definition: parity_math.c:362
m0_bcount_t ir_copied_nr
#define M0_ASSERT_EX(cond)
static uint32_t unit_size
Definition: layout.c:53
#define M0_ENTRY(...)
Definition: trace.h:170
static uint32_t io_di_size(const struct io_request *req)
Definition: file.c:6454
M0_INTERNAL int m0_pagesize_get(void)
Definition: memory.c:233
Definition: buf.h:37
static struct m0_sm_ast ast[NR]
Definition: locality.c:44
uint64_t osr_xid
Definition: onwire.h:105
M0_INTERNAL void m0_sm_group_unlock(struct m0_sm_group *grp)
Definition: sm.c:96
M0_INTERNAL bool m0t1fs_inode_bob_check(struct m0t1fs_inode *bob)
int32_t m0_rpc_item_generic_reply_rc(const struct m0_rpc_item *reply)
Definition: fom_generic.c:81
Definition: vec.h:625
Definition: filter.py:1
static const struct m0_sm_conf io_sm_conf
Definition: file.c:1025
static struct m0t1fs_inode m0inode
Definition: fsync.c:87
static char * addr
Definition: node_k.c:37
void m0_fop_put0_lock(struct m0_fop *fop)
Definition: fop.c:213
int i
Definition: dir.c:1033
void m0_fop_rpc_machine_set(struct m0_fop *fop, struct m0_rpc_machine *mach)
Definition: fop.c:352
M0_INTERNAL m0_bcount_t m0_rpc_session_get_max_item_payload_size(const struct m0_rpc_session *session)
Definition: session.c:775
struct m0_sm rin_sm
Definition: rm.h:1436
m0_pdclust_unit_type
Definition: pdclust.h:89
enum page_attr db_flags
#define PRIu64
Definition: types.h:58
M0_INTERNAL int m0_indexvec_varr_alloc(struct m0_indexvec_varr *ivec, uint32_t len)
Definition: vec.c:1136
struct m0_rpc_machine * c_rpc_machine
Definition: conn.h:278
struct m0_fid crw_fid
Definition: io_fops.h:385
static m0_bindex_t gfile_offset(m0_bindex_t toff, const struct pargrp_iomap *map, const struct m0_pdclust_layout *play, const struct m0_pdclust_src_addr *src)
Definition: file.c:648
static uint32_t rows_nr(struct m0_pdclust_layout *play)
Definition: file.c:691
static void cc_fop_release(struct m0_ref *ref)
Definition: file.c:6514
int32_t nbe_status
Definition: net.h:1218
M0_INTERNAL bool m0_ext_is_valid(const struct m0_ext *ext)
Definition: ext.c:90
static int ioreq_dgmode_read(struct io_request *req, bool rmw)
Definition: file.c:3807
struct m0_rpc_machine * m0_fop_rpc_machine(const struct m0_fop *fop)
Definition: fop.c:360
#define M0_ERR_INFO(rc, fmt,...)
Definition: trace.h:215
int(* nxo_distribute)(struct nw_xfer_request *xfer)
uint64_t ti_parbytes
static int io_spare_map(const struct pargrp_iomap *map, const struct m0_pdclust_src_addr *src, uint32_t *spare_slot, uint32_t *spare_slot_prev, enum m0_pool_nd_state *eff_state)
Definition: file.c:2667
static void client_passive_recv(const struct m0_net_buffer_event *evt)
Definition: file.c:5622
return M0_ERR(-EOPNOTSUPP)
struct io_mem_stats iommstats
Definition: file.c:322
static void cc_rpc_item_cb(struct m0_rpc_item *item)
Definition: file.c:5851
static void target_ioreq_type_set(struct target_ioreq *ti, enum target_ioreq_type type)
Definition: file.c:584
void * sa_datum
Definition: sm.h:508
M0_INTERNAL void m0_rpc_machine_unlock(struct m0_rpc_machine *machine)
Definition: rpc_machine.c:558
M0_INTERNAL struct m0_fop_cob_rw_reply * io_rw_rep_get(struct m0_fop *fop)
Definition: io_fops.c:1056
struct m0_fop if_fop
Definition: io_fops.h:174
void * b_addr
Definition: buf.h:231
M0_INTERNAL void m0_rpc_bulk_default_cb(const struct m0_net_buffer_event *evt)
Definition: bulk.c:140
Definition: trace.h:482
static void mark_page_as_read_failed(struct pargrp_iomap *map, uint32_t row, uint32_t col, enum page_attr page_type)
Definition: file.c:2701
Definition: cnt.h:36
static m0_bindex_t data_page_offset_get(struct pargrp_iomap *map, uint32_t row, uint32_t col)
Definition: file.c:767
int(* pi_populate)(struct pargrp_iomap *iomap, struct m0_ivec_varr_cursor *cursor)
void m0_addb2_push(uint64_t id, int n, const uint64_t *value)
Definition: addb2.c:412
static void ioreq_no_unlock(struct io_request *req)
Definition: file.c:4003
M0_INTERNAL m0_bindex_t m0_ivec_varr_cursor_index(const struct m0_ivec_varr_cursor *cur)
Definition: vec.c:1237
M0_INTERNAL struct m0_file * m0_fop_to_file(struct m0_fop *fop)
Definition: file.c:5998
Definition: refs.h:34
#define m0_tl_teardown(name, head, obj)
Definition: tlist.h:708
int(* tio_cc_fops_prepare)(struct target_ioreq *ti)
struct m0_fid pv_id
Definition: pool.h:113
static bool io_request_invariant(struct io_request *req)
Definition: file.c:1057
struct m0_net_buffer * bb_nbuf
Definition: bulk.h:177
struct m0_parity_math pi_math
Definition: pdclust.h:223
M0_INTERNAL struct m0t1fs_inode * m0t1fs_inode_to_m0inode(const struct inode *inode)
Definition: file.c:462
static int ioreq_no_lock(struct io_request *req)
Definition: file.c:3998
enum pargrp_iomap_state pi_state
#define m0_free0(pptr)
Definition: memory.h:77
void(* tio_seg_add)(struct target_ioreq *ti, const struct m0_pdclust_src_addr *src, const struct m0_pdclust_tgt_addr *tgt, m0_bindex_t gob_offset, m0_bcount_t count, struct pargrp_iomap *map)
static uint64_t page_nr(m0_bcount_t size)
Definition: file.c:492
M0_INTERNAL size_t m0_io_fop_size_get(struct m0_fop *fop)
Definition: io_fops.c:1589
struct m0_net_transfer_mc rm_tm
Definition: rpc_machine.h:88
m0_bcount_t b_nob
Definition: buf.h:38
struct m0_io_descs crw_desc
Definition: io_fops.h:400
static uint64_t page_id(m0_bindex_t offset)
Definition: file.c:686
#define M0_ASSERT(cond)
const char * scf_name
Definition: sm.h:352
struct m0_buf db_buf
struct m0t1fs_mdop mo
Definition: dir.c:332
struct page * db_page
struct nw_xfer_request ir_nwxfer
M0_THREAD_ENTER
Definition: dir.c:336
struct m0_fid pver
Definition: idx_dix.c:74
struct m0_rpc_item_header2 ri_header
Definition: item.h:193
void m0_sm_state_set(struct m0_sm *mach, int state)
Definition: sm.c:478
struct m0_rpc_machine * m0_fop_session_machine(const struct m0_rpc_session *s)
Definition: fop.c:453
uint32_t c_cob_idx
Definition: io_fops.h:471
m0_pool_nd_state
Definition: pool_machine.h:57
M0_INTERNAL struct m0t1fs_inode * m0t1fs_file_to_m0inode(const struct file *file)
Definition: file.c:444
static void data_buf_dealloc_fini(struct data_buf *buf)
Definition: file.c:4838
static struct m0_pdclust_instance * pdlayout_instance(const struct m0_layout_instance *li)
Definition: file.c:504
const struct nw_xfer_ops * nxr_ops
M0_INTERNAL bool m0_is_io_fop_rep(const struct m0_fop *fop)
Definition: io_fops.c:945
uint64_t ta_frame
Definition: pdclust.h:254
static struct m0_fop reply_fop
Definition: fsync.c:64
struct m0_sm ir_sm
#define M0_ADDB2_OBJ(obj)
Definition: addb2.h:276
#define m0_htable_forall(name, var, htable,...)
Definition: hash.h:465
M0_INTERNAL int m0t1fs_ref_get_lock(struct m0t1fs_sb *csb)
Definition: super.c:722
static ssize_t file_dio_write(struct kiocb *kcb, struct iov_iter *from)
Definition: file.c:5187
struct m0t1fs_inode * ci
Definition: dir.c:622
static void ioreq_file_unlock(struct io_request *req)
Definition: file.c:3991
#define bob_of(ptr, type, field, bt)
Definition: bob.h:140
static void m0_atomic64_dec(struct m0_atomic64 *a)
M0_INTERNAL struct m0_poolmach * m0t1fs_file_to_poolmach(const struct file *file)
Definition: file.c:457
struct m0_atomic64 nxr_ccfop_nr
static int unit_state(const struct m0_pdclust_src_addr *src, const struct io_request *req, enum m0_pool_nd_state *state)
Definition: file.c:2645
static struct m0_bufvec bvec
Definition: xcode.c:169
static void dgmode_rwvec_dealloc_fini(struct dgmode_rwvec *dg)
Definition: file.c:3335
int m0t1fs_setxattr(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) int m0t1fs_setxattr(struct dentry *dentry
M0_INTERNAL int m0_varr_init(struct m0_varr *arr, uint64_t nr, size_t size, size_t bufsize)
Definition: varr.c:114
int32_t rin_rc
Definition: rm.h:1446
static int ioreq_parity_recalc(struct io_request *req)
Definition: file.c:1649
int m0t1fs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
Definition: fsync.c:354
static struct m0_stob_domain * dom
Definition: storage.c:38
struct m0_varr dr_pageattrs
void(* iro_iomaps_destroy)(struct io_request *req)
uint64_t pi_grpid
void * nb_app_private
Definition: net.h:1477
uint64_t b_lid
Definition: md_fops.h:90
M0_INTERNAL struct m0_pdclust_layout * m0_layout_to_pdl(const struct m0_layout *l)
Definition: pdclust.c:382
struct m0_fop * m0_fop_get(struct m0_fop *fop)
Definition: fop.c:162
const struct m0_rpc_item_type * ri_type
Definition: item.h:200
M0_INTERNAL ssize_t m0t1fs_aio(struct kiocb *kcb, const struct iovec *iov, struct m0_indexvec_varr *ivv, enum io_req_type rw)
Definition: file.c:5072
static uint64_t layout_unit_size(const struct m0_pdclust_layout *play)
Definition: file.c:525
struct m0_fid c_gobfid
Definition: io_fops.h:460
struct m0_rpc_item * ri_reply
Definition: item.h:163
void * m0_alloc(size_t size)
Definition: memory.c:126
struct m0_semaphore m0t1fs_cpus_sem
Definition: m0t1fs.c:44
struct m0_fop_mod_rep rwr_mod_rep
Definition: io_fops.h:339
struct m0_sm_group * sm_grp
Definition: sm.h:321
M0_INTERNAL uint32_t m0_fid_cob_device_id(const struct m0_fid *cob_fid)
Definition: fid_convert.c:81
M0_INTERNAL void m0_mutex_init(struct m0_mutex *mutex)
Definition: mutex.c:35
struct m0_fid b_pver
Definition: md_fops.h:93
uint64_t f_container
Definition: fid.h:39
enum pargrp_iomap_rmwtype pi_rtype
uint64_t ri_nr_sent_max
Definition: item.h:146
#define M0_POST(cond)
struct m0_0vec bb_zerovec
Definition: bulk.h:179
Definition: xcode.h:73
target_ioreq_type
static bool nw_xfer_request_invariant(const struct nw_xfer_request *xfer)
Definition: file.c:1090
M0_TL_DEFINE(iofops, static, struct io_req_fop)
int(* pi_dgmode_postprocess)(struct pargrp_iomap *map)
static void ioreq_failed_fini(struct io_request *req, int rc)
Definition: file.c:5058
static struct m0_sm_group * file_to_smgroup(const struct file *file)
Definition: file.c:487
M0_INTERNAL int m0t1fs_setattr(struct dentry *dentry, struct iattr *attr)
Definition: dir.c:1586
struct m0_varr iv_count
Definition: vec.h:708
uint32_t v_nr
Definition: vec.h:51
static int nw_xfer_req_dispatch(struct nw_xfer_request *xfer)
Definition: file.c:6155
static int pargrp_iomap_select_ro_rr(struct pargrp_iomap *map, m0_bcount_t data_pages_nr, m0_bcount_t parity_pages_nr)
Definition: file.c:2471
m0_net_buffer_cb_proc_t nbc_cb[M0_NET_QT_NR]
Definition: net.h:1272
static uint64_t tolerance_of_level(struct io_request *req, uint64_t lv)
Definition: file.c:3597
static bool pargrp_iomap_spans_seg(struct pargrp_iomap *map, m0_bindex_t index, m0_bcount_t count)
Definition: file.c:1931
static m0_bindex_t offset
Definition: dump.c:173
static void pargrp_src_addr(m0_bindex_t index, const struct io_request *req, const struct target_ioreq *tio_req, struct m0_pdclust_src_addr *src)
Definition: file.c:621
M0_INTERNAL void m0_fd_bwd_map(struct m0_pdclust_instance *pi, const struct m0_pdclust_tgt_addr *tgt, struct m0_pdclust_src_addr *src)
Definition: fd.c:959
static int pargrp_iomap_paritybufs_alloc(struct pargrp_iomap *map)
Definition: file.c:2275
static const struct m0_bob_type nwxfer_bobtype
Definition: file.c:342
int(* pi_parity_verify)(struct pargrp_iomap *map)
struct m0_indexvec_varr dr_bufvec
struct m0_htable nxr_tioreqs_hash
M0_INTERNAL int m0_fop_data_alloc(struct m0_fop *fop)
Definition: fop.c:71
static void target_ioreq_fini(struct target_ioreq *ti)
Definition: file.c:4708
M0_INTERNAL void m0_fop_fini(struct m0_fop *fop)
Definition: fop.c:136
struct m0_rpc_session * ti_session
struct m0_indexvec_varr ir_ivv
static bool pargrp_iomap_invariant(struct pargrp_iomap *map)
Definition: file.c:1185
static struct m0_clink clink[RDWR_REQUEST_MAX]
static uint64_t min64u(uint64_t a, uint64_t b)
Definition: arith.h:66
struct m0_tl ti_iofops
static int pargrp_iomap_populate_pi_ivec(struct pargrp_iomap *map, struct m0_ivec_varr_cursor *cursor, bool rmw)
Definition: file.c:2364
static int pargrp_iomap_dgmode_postprocess(struct pargrp_iomap *map)
Definition: file.c:2861
void(* iro_file_unlock)(struct io_request *req)
struct m0_indexvec_varr dr_ivec_varr
M0_INTERNAL int m0t1fs_cob_setattr(struct inode *inode, struct m0t1fs_mdop *mo)
Definition: dir.c:2477
static void page_pos_get(struct pargrp_iomap *map, m0_bindex_t index, uint32_t *row, uint32_t *col)
Definition: file.c:725
static struct fdmi_ctx ctx
Definition: main.c:80
M0_INTERNAL void m0_indexvec_varr_free(struct m0_indexvec_varr *ivec)
Definition: vec.c:1160
#define FID_P(f)
Definition: fid.h:77
static uint64_t data_size(const struct m0_pdclust_layout *play)
Definition: file.c:550
void m0_addb2_pop(uint64_t id)
Definition: addb2.c:440
static const struct m0_rpc_item_ops io_item_ops
Definition: file.c:810
M0_TL_DESCR_DECLARE(rpcbulk, M0_EXTERN)
const struct target_ioreq_ops * ti_ops
int(* iro_dgmode_recover)(struct io_request *req)
static const uint64_t k2
Definition: hash_fnc.c:35
static void irfop_fini(struct io_req_fop *irfop)
Definition: file.c:5044
static uint64_t round_down(uint64_t val, uint64_t size)
Definition: file.c:697
static int nw_xfer_io_distribute(struct nw_xfer_request *xfer)
Definition: file.c:3394
M0_INTERNAL const char * m0_rpc_item_opname(const struct m0_rpc_item *item)
Definition: item.c:1195
struct m0_bufvec z_bvec
Definition: vec.h:514
static uint32_t layout_n(const struct m0_pdclust_layout *play)
Definition: file.c:515
static int ioreq_parity_verify(struct io_request *req)
Definition: file.c:1611
const struct iovec * ir_iovec
static ssize_t m0t1fs_direct_IO(struct kiocb *kcb, struct iov_iter *from)
Definition: file.c:6784
copy_direction
static struct m0_pdclust_layout * pdlayout_get(const struct io_request *req)
Definition: file.c:510
static int64_t m0_atomic64_get(const struct m0_atomic64 *a)
static int pargrp_iomap_dgmode_process(struct pargrp_iomap *map, struct target_ioreq *tio, m0_bindex_t *index, uint32_t count)
Definition: file.c:2759
void(* rio_replied)(struct m0_rpc_item *item)
Definition: item.h:300
static int pargrp_iomap_parity_recalc(struct pargrp_iomap *map)
Definition: file.c:1502
#define m0_forall(var, nr,...)
Definition: misc.h:112
uint64_t sa_unit
Definition: pdclust.h:243
uint32_t sd_flags
Definition: sm.h:378
static int ioreq_dgmode_write(struct io_request *req, bool rmw)
Definition: file.c:3711
M0_INTERNAL int m0_rpc_session_validate(struct m0_rpc_session *session)
Definition: session.c:573
const struct m0_net_buffer_callbacks client_buf_bulk_cb
Definition: file.c:5688
struct m0_fop_type m0_fop_cob_readv_fopt
Definition: io_fops.c:71
M0_INTERNAL void m0_ivec_varr_cursor_init(struct m0_ivec_varr_cursor *cur, struct m0_indexvec_varr *ivec)
Definition: vec.c:1183
M0_INTERNAL size_t m0_rpc_bulk_buf_length(struct m0_rpc_bulk *rbulk)
Definition: bulk.c:550
static const struct m0_bob_type tioreq_bobtype
Definition: file.c:338
#define PRIu32
Definition: types.h:66
uint64_t ti_databytes
M0_INTERNAL size_t m0_rpc_bulk_store_del_unqueued(struct m0_rpc_bulk *rbulk)
Definition: bulk.c:190
static bool should_spare_be_mapped(struct io_request *req, enum m0_pool_nd_state dev_state)
Definition: file.c:4507
static int pargrp_iomap_readold_auxbuf_alloc(struct pargrp_iomap *map)
Definition: file.c:2099
struct m0_pdclust_tgt_addr tgt
Definition: fd.c:110
static uint8_t fail[DATA_UNIT_COUNT_MAX+PARITY_UNIT_COUNT_MAX]
static const struct m0_rpc_item_ops cc_item_ops
Definition: file.c:814
static struct m0_parity_math * parity_math(struct io_request *req)
Definition: file.c:555
static void paritybufs_set_dgw_mode(struct pargrp_iomap *iomap, struct m0_pdclust_layout *play, uint64_t unit)
Definition: file.c:3373
M0_INTERNAL int64_t m0_ref_read(const struct m0_ref *ref)
Definition: refs.c:44
M0_BOB_DEFINE(static, &tioreq_bobtype, target_ioreq)
static void cc_bottom_half(struct m0_sm_group *grp, struct m0_sm_ast *ast)
Definition: file.c:5876
static const struct m0_bob_type pgiomap_bobtype
Definition: file.c:341
static void io_bottom_half(struct m0_sm_group *grp, struct m0_sm_ast *ast)
Definition: file.c:6025
M0_INTERNAL void m0_varr_fini(struct m0_varr *arr)
Definition: varr.c:486
M0_INTERNAL void m0_file_unlock(struct m0_rm_incoming *req)
Definition: file.c:540
#define M0_CNT_INC(cnt)
Definition: arith.h:226
static int ioreq_iosm_handle(struct io_request *req)
Definition: file.c:4018
static int pargrp_iomap_seg_process(struct pargrp_iomap *map, uint64_t seg, bool rmw)
Definition: file.c:1965
#define indexvec_varr_dump(ivec)
Definition: file.c:411
static void nw_xfer_request_fini(struct nw_xfer_request *xfer)
Definition: file.c:1234
#define M0_FI_ENABLED(tag)
Definition: finject.h:231
struct m0_ref f_ref
Definition: fop.h:80
Definition: ext.h:37
static int ioreq_iomaps_parity_groups_cal(struct io_request *req)
Definition: file.c:3130
Definition: fid.h:38
static bool tioreq_key_eq(const void *key1, const void *key2)
Definition: file.c:600
uint64_t f_key
Definition: fid.h:40
m0_bindex_t e_start
Definition: ext.h:39
M0_INTERNAL void m0_sm_init(struct m0_sm *mach, const struct m0_sm_conf *conf, uint32_t state, struct m0_sm_group *grp)
Definition: sm.c:313
#define M0_IS0(obj)
Definition: misc.h:70
M0_INTERNAL void m0_rpc_machine_lock(struct m0_rpc_machine *machine)
Definition: rpc_machine.c:551
static struct m0_rpc_session * target_session(struct io_request *req, struct m0_fid tfid)
Definition: file.c:679
struct m0_fid ti_fid
static uint64_t indexvec_varr_count(struct m0_indexvec_varr *varr)
Definition: file.c:535
static struct m0_layout_instance * layout_instance(const struct io_request *req)
Definition: file.c:498
static bool io_req_fop_invariant(const struct io_req_fop *fop)
Definition: file.c:1165
static int pargrp_iomap_init(struct pargrp_iomap *map, struct io_request *req, uint64_t grpid)
Definition: file.c:1795
#define M0_ALLOC_PTR(ptr)
Definition: memory.h:86
struct cc_req_fop ti_cc_fop
M0_INTERNAL void m0_clink_add(struct m0_chan *chan, struct m0_clink *link)
Definition: chan.c:228
static int ioreq_user_data_copy(struct io_request *req, enum copy_direction dir, enum page_attr filter)
Definition: file.c:1700
static bool pargrp_iomap_invariant_nr(struct io_request *req)
Definition: file.c:1201
const struct m0_rpc_item_ops * ri_ops
Definition: item.h:149
static void ioreq_sm_state_set_nolock(struct io_request *req, int state)
Definition: file.c:1049
#define PRIi64
Definition: types.h:59
int(* nxo_dispatch)(struct nw_xfer_request *xfer)
int(* iro_dgmode_read)(struct io_request *req, bool rmw)
struct m0_mutex nxr_lock
M0_INTERNAL m0_bcount_t m0_net_domain_get_max_buffer_desc_size(struct m0_net_domain *dom)
const struct m0_uint128 m0_rm_m0t1fs_group
Definition: inode.c:59
enum nw_xfer_state nxr_state
struct m0_indexvec_varr ti_ivv
static uint64_t pargrp_iomap_auxbuf_alloc(struct pargrp_iomap *map, uint32_t row, uint32_t col)
Definition: file.c:2076
struct m0_rpc_session * ri_session
Definition: item.h:147
static int target_ioreq_init(struct target_ioreq *ti, struct nw_xfer_request *xfer, const struct m0_fid *cobfid, uint64_t ta_obj, struct m0_rpc_session *session, uint64_t size)
Definition: file.c:4631
static bool data_buf_invariant_nr(const struct pargrp_iomap *map)
Definition: file.c:1118
static void io_req_fop_release(struct m0_ref *ref)
Definition: file.c:5747
static void target_ioreq_seg_add(struct target_ioreq *ti, const struct m0_pdclust_src_addr *src, const struct m0_pdclust_tgt_addr *tgt, m0_bindex_t gob_offset, m0_bcount_t count, struct pargrp_iomap *map)
Definition: file.c:4857
struct m0_fop_type m0_fop_cob_create_fopt
Definition: io_fops.c:75
struct m0_rpc_item * m0_fop_to_rpc_item(const struct m0_fop *fop)
Definition: fop.c:338
static void ioreq_sm_failed(struct io_request *req, int rc)
Definition: file.c:1031
static int pargrp_iomap_populate(struct pargrp_iomap *map, struct m0_ivec_varr_cursor *cursor)
Definition: file.c:2506
M0_INTERNAL enum m0_pdclust_unit_type m0_pdclust_unit_classify(const struct m0_pdclust_layout *pl, int unit)
Definition: pdclust.c:425
static void nw_xfer_request_init(struct nw_xfer_request *xfer)
Definition: file.c:1207
M0_TL_DESCR_DEFINE(iofops, "List of IO fops", static, struct io_req_fop, irf_link, irf_magic, M0_T1FS_IOFOP_MAGIC, M0_T1FS_TIOREQ_MAGIC)
m0_bcount_t size
Definition: di.c:39
static uint64_t parity_units_page_nr(const struct m0_pdclust_layout *play)
Definition: file.c:530
page_attr
#define _0C(exp)
Definition: assert.h:311
static int start(struct m0_fom *fom)
Definition: trigger_fom.c:321
#define V_COUNT(ivec, i)
Definition: file.c:397
struct data_buf *** pi_databufs
M0_INTERNAL void m0_mutex_fini(struct m0_mutex *mutex)
Definition: mutex.c:42
M0_INTERNAL void m0_clink_fini(struct m0_clink *link)
Definition: chan.c:208
m0_bcount_t rb_bytes
Definition: bulk.h:260
static int nw_xfer_tioreq_get(struct nw_xfer_request *xfer, const struct m0_fid *fid, uint64_t ta_obj, struct m0_rpc_session *session, uint64_t size, struct target_ioreq **out)
Definition: file.c:4754
void m0_fop_put_lock(struct m0_fop *fop)
Definition: fop.c:199
M0_INTERNAL bool m0_rpc_bulk_is_empty(struct m0_rpc_bulk *rbulk)
Definition: bulk.c:539
static void ioreq_pgiomap_find(struct io_request *req, uint64_t grpid, uint64_t *cursor, struct pargrp_iomap **out)
Definition: file.c:1675
const struct io_request_ops * ir_ops
struct m0_atomic64 nxr_iofop_nr
static struct m0_fop * fop
Definition: item.c:57
#define INDEX(ivec, i)
Definition: file.c:391
static void user_page_unmap(struct data_buf *dbuf, bool set_dirty)
Definition: file.c:1284
M0_INTERNAL int32_t m0_net_domain_get_max_buffer_segments(struct m0_net_domain *dom)
static void indexvec_sort(struct m0_indexvec_varr *ivec)
Definition: file.c:1768
struct m0_io_fop irf_iofop
M0_INTERNAL void m0_sm_group_lock(struct m0_sm_group *grp)
Definition: sm.c:83
struct pargrp_iomap ** ir_iomaps
struct m0_fop * m0_rpc_item_to_fop(const struct m0_rpc_item *item)
Definition: fop.c:346
static int target_ioreq_iofops_prepare(struct target_ioreq *ti, enum page_attr filter)
Definition: file.c:6566
static const struct m0_bob_type dtbuf_bobtype
Definition: file.c:343
int(* pi_parity_recalc)(struct pargrp_iomap *map)
M0_INTERNAL void m0_rm_owner_unlock(struct m0_rm_owner *owner)
Definition: rm.c:603
M0_INTERNAL int user_page_map(struct data_buf *dbuf, unsigned long user_addr)
Definition: file.c:1247
static uint64_t group_id(m0_bindex_t index, m0_bcount_t dtsize)
Definition: file.c:560
M0_INTERNAL struct m0_rpc_session * m0t1fs_container_id_to_session(const struct m0_pool_version *pver, uint64_t container_id)
Definition: super.c:166
static struct m0_be_seg * seg
Definition: btree.c:40
int(* iro_iomaps_prepare)(struct io_request *req)
M0_INTERNAL void iov_iter_advance(struct iov_iter *i, size_t bytes)
static uint32_t ioreq_sm_state(const struct io_request *req)
Definition: file.c:975
struct m0_fid c_pver
Definition: io_fops.h:468
#define M0_ASSERT_INFO(cond, fmt,...)
#define m0_varr_for(arr, type, idx, obj)
Definition: varr.h:259
struct m0_tl rb_buflist
Definition: bulk.h:256
#define V_SEG_NR(ivec)
Definition: file.c:398
static void data_buf_init(struct data_buf *buf, void *addr, uint64_t flags)
Definition: file.c:1146
static uint64_t tioreqs_hash_func(const struct m0_htable *htable, const void *k)
Definition: file.c:593
M0_INTERNAL void m0_io_fop_fini(struct m0_io_fop *iofop)
Definition: io_fops.c:897
M0_INTERNAL int m0_io_fop_init(struct m0_io_fop *iofop, const struct m0_fid *gfid, struct m0_fop_type *ftype, void(*fop_release)(struct m0_ref *))
Definition: io_fops.c:865
static void device_state_reset(struct nw_xfer_request *xfer, bool rmw)
Definition: file.c:4006
static const struct pargrp_iomap_ops iomap_ops
Definition: file.c:876
static bool is_session_marked(struct io_request *req, struct m0_rpc_session *session)
Definition: file.c:3615
M0_INTERNAL void m0_ext_intersection(const struct m0_ext *e0, const struct m0_ext *e1, struct m0_ext *result)
Definition: ext.c:81
M0_INTERNAL struct m0_pdclust_instance * m0_layout_instance_to_pdi(const struct m0_layout_instance *li)
Definition: pdclust.c:400
struct inode * dir
Definition: dir.c:1028
M0_INTERNAL void m0_semaphore_down(struct m0_semaphore *semaphore)
Definition: semaphore.c:49
Definition: nucleus.c:42
struct nw_xfer_request * ti_nwxfer
struct m0_rm_incoming ir_in
io_req_state
#define out(...)
Definition: gen.c:41
M0_INTERNAL void m0_rpc_bulk_store_del(struct m0_rpc_bulk *rbulk)
Definition: bulk.c:215
Definition: file.h:81
M0_INTERNAL bool m0_is_read_fop(const struct m0_fop *fop)
Definition: io_fops.c:916
int type
Definition: dir.c:1031
static uint32_t target_ioreq_type_get(struct target_ioreq *ti)
Definition: file.c:579
static uint64_t pargrp_id_find(m0_bindex_t index, const struct io_request *req, const struct io_req_fop *ir_fop)
Definition: file.c:638
M0_INTERNAL void m0_rm_owner_lock(struct m0_rm_owner *owner)
Definition: rm.c:592
struct m0_fid gfid
Definition: dir.c:626
M0_INTERNAL void m0_semaphore_up(struct m0_semaphore *semaphore)
Definition: semaphore.c:65
struct m0_uint128 cr_group_id
Definition: rm.h:506
static void seg_align(struct pargrp_iomap *map, uint32_t seg, m0_bindex_t end, uint64_t sz)
Definition: file.c:2351
M0_INTERNAL struct m0_fop_cob_rw * io_rw_get(struct m0_fop *fop)
Definition: io_fops.c:1037
static bool should_req_sm_complete(struct io_request *req)
Definition: file.c:5939
static int32_t min32(int32_t a, int32_t b)
Definition: arith.h:36
struct target_ioreq * db_tioreq
M0_INTERNAL bool m0_fid_is_valid(const struct m0_fid *fid)
Definition: fid.c:96
static uint64_t iomap_page_nr(struct pargrp_iomap *map)
Definition: file.c:545
M0_INTERNAL void m0_fd_fwd_map(struct m0_pdclust_instance *pi, const struct m0_pdclust_src_addr *src, struct m0_pdclust_tgt_addr *tgt)
Definition: fd.c:838
static void ioreq_iomaps_destroy(struct io_request *req)
Definition: file.c:3255
M0_INTERNAL int m0_io_fop_prepare(struct m0_fop *fop)
Definition: io_fops.c:1513
struct m0_rpc_machine * ri_rmachine
Definition: item.h:160
int(* iro_iosm_handle)(struct io_request *req)
Definition: varr.h:121
#define M0_PRE_EX(cond)
static struct m0_dtm_oper_descr reply
Definition: transmit.c:94
static void m0_atomic64_add(struct m0_atomic64 *a, int64_t num)
M0_INTERNAL int m0_rpc_bulk_buf_add(struct m0_rpc_bulk *rbulk, uint32_t segs_nr, m0_bcount_t length, struct m0_net_domain *netdom, struct m0_net_buffer *nb, struct m0_rpc_bulk_buf **out)
Definition: bulk.c:291
const struct inode_operations m0t1fs_reg_inode_operations
Definition: file.c:6771
M0_INTERNAL struct m0_pool_version * m0t1fs_file_to_pver(const struct file *file)
Definition: file.c:449
uint64_t s_session_id
Definition: session.h:309
struct m0_fop_type m0_fop_cob_writev_fopt
Definition: io_fops.c:72
static const struct target_ioreq_ops tioreq_ops
Definition: file.c:907
#define m0_tl_for(name, head, obj)
Definition: tlist.h:695
void m0_free(void *data)
Definition: memory.c:146
static int device_check(struct io_request *req)
Definition: file.c:3641
#define m0_htable_endfor
Definition: hash.h:491
uint64_t * ir_failed_session
struct m0_rpc_item f_item
Definition: fop.h:83
struct m0_fop_cob c_body
Definition: io_fops.h:456
int(* iro_user_data_copy)(struct io_request *req, enum copy_direction dir, enum page_attr filter)
uint32_t sm_state
Definition: sm.h:307
static int iofop_async_submit(struct m0_io_fop *iofop, struct m0_rpc_session *session)
Definition: file.c:5697
static bool is_pver_dud(uint32_t fdev_nr, uint32_t dev_k, uint32_t fsvc_nr, uint32_t svc_k)
Definition: file.c:3699
struct m0_pdclust_src_addr src
Definition: fd.c:108
static bool data_buf_invariant(const struct data_buf *db)
Definition: file.c:1110
struct dgmode_rwvec * ti_dgvec
M0_INTERNAL void m0_bob_type_tlist_init(struct m0_bob_type *bt, const struct m0_tl_descr *td)
Definition: bob.c:41
struct file * ir_file
int32_t rc
Definition: trigger_fop.h:47
uint64_t h_bucket_nr
Definition: hash.h:178
static uint32_t io_desc_size(struct m0_net_domain *ndom)
Definition: file.c:6439
M0_INTERNAL struct m0t1fs_sb * m0inode_to_sb(const struct m0t1fs_inode *m0inode)
Definition: file.c:472
#define ARRAY_SIZE(a)
Definition: misc.h:45
const struct pargrp_iomap_ops * pi_ops
M0_INTERNAL void m0t1fs_ref_put_lock(struct m0t1fs_sb *csb)
Definition: super.c:749
#define M0_POST_EX(cond)
#define offsetof(typ, memb)
Definition: misc.h:29
const struct file_operations m0t1fs_reg_file_operations
Definition: file.c:5602
M0_INTERNAL void m0_poolmach_gob2cob(struct m0_poolmach *pm, const struct m0_fid *gfid, uint32_t idx, struct m0_fid *cob_fid)
struct m0_rpc_conn * s_conn
Definition: session.h:312
M0_HT_DEFINE(tioreqht, static, struct target_ioreq, uint64_t)
static void io_request_fini(struct io_request *req)
Definition: file.c:4404
struct m0_be_tx_remid fmr_remid
Definition: fom_generic.h:243
static uint64_t target_offset(uint64_t frame, struct m0_pdclust_layout *play, m0_bindex_t gob_offset)
Definition: file.c:571
M0_INTERNAL int m0t1fs_size_update(struct dentry *dentry, uint64_t newsize)
Definition: dir.c:1525
static struct m0_sm_state_descr io_states[]
Definition: file.c:980
int(* tio_iofops_prepare)(struct target_ioreq *ti, enum page_attr filter)
static uint8_t parity[DATA_UNIT_COUNT_MAX][UNIT_BUFF_SIZE_MAX]
Definition: fop.h:79
void m0t1fs_fsync_record_update(struct m0_reqh_service_ctx *service, struct m0t1fs_sb *csb, struct m0t1fs_inode *inode, struct m0_be_tx_remid *btr)
Definition: fsync.c:397
static const struct m0_fid * file_to_fid(const struct file *file)
Definition: file.c:477
struct m0_mutex rb_mutex
Definition: bulk.h:251
enum page_attr irf_pattr
static uint64_t max64u(uint64_t a, uint64_t b)
Definition: arith.h:71
const struct m0_di_ops * fi_di_ops
Definition: file.h:92
static struct m0_addb2_frame_header last
Definition: storage.c:93
int m0t1fs_getxattr(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) ssize_t m0t1fs_getxattr(struct dentry *dentry
static int pargrp_iomap_readrest(struct pargrp_iomap *map)
Definition: file.c:2203
static struct target_ioreq * target_ioreq_locate(struct nw_xfer_request *xfer, const struct m0_fid *fid)
Definition: file.c:4738
#define FID_F
Definition: fid.h:75
static struct m0_indexvec_varr * indexvec_create(unsigned long seg_nr, const struct iovec *iov, loff_t pos)
Definition: file.c:5145
M0_INTERNAL bool m0_ivec_varr_cursor_move(struct m0_ivec_varr_cursor *cur, m0_bcount_t count)
Definition: vec.c:1198
enum io_req_type ir_type
static void m0_atomic64_set(struct m0_atomic64 *a, int64_t num)
struct m0_fop * rep_fop
Definition: dir.c:334
static uint64_t pargrp_iomap_fullpages_count(struct pargrp_iomap *map)
Definition: file.c:2051
static struct data_buf * data_buf_alloc_init(enum page_attr pattr)
Definition: file.c:4801
int(* iro_file_lock)(struct io_request *req)
M0_INTERNAL struct inode * m0t1fs_file_to_inode(const struct file *file)
Definition: file.c:435
Definition: idx_mock.c:47
static const struct nw_xfer_ops xfer_ops
Definition: file.c:837
static int pargrp_iomap_pages_mark_as_failed(struct pargrp_iomap *map, enum m0_pdclust_unit_type type)
Definition: file.c:2583
#define m0_tl_forall(name, var, head,...)
Definition: tlist.h:735
static int pargrp_iomap_databuf_alloc(struct pargrp_iomap *map, uint32_t row, uint32_t col)
Definition: file.c:1951
M0_INTERNAL struct m0_reqh_service_ctx * m0_reqh_service_ctx_from_session(struct m0_rpc_session *session)
M0_INTERNAL void io_bob_tlists_init(void)
Definition: file.c:790
struct m0_indexvec_varr * vc_ivv
Definition: vec.h:718
M0_INTERNAL void m0_sm_fini(struct m0_sm *mach)
Definition: sm.c:331