RBD Image Storage
When we create an image in an empty pool:
# rbd create -p pool100 user1_image1 --size 102400 --image-format 2
several new objects appear in pool100:
# rados -p pool100 ls
rbd_header.134d2ae8944a
rbd_directory
rbd_id.user1_image1
The rbd_directory object indexes all images in the pool, storing both id-to-name (id_<image_id>) and name-to-id (name_<image_name>) mappings:
# rados -p pool100 listomapvals rbd_directory
id_134d2ae8944a
value: (16 bytes) :
0000 : 0c 00 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 : ....user1_image1
name_user1_image1
value: (16 bytes) :
0000 : 0c 00 00 00 31 33 34 64 32 61 65 38 39 34 34 61 : ....134d2ae8944a
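Each value uses Ceph's standard string encoding: a 32-bit little-endian length (0x0c = 12) followed by the raw bytes. As a minimal sketch (assuming a reachable cluster and the default ceph.conf; not part of any rbd tool), the directory can be dumped programmatically through librados' omap API:

#include <rados/librados.hpp>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <map>
#include <string>

int main() {
  librados::Rados cluster;
  cluster.init(NULL);            // connect as client.admin
  cluster.conf_read_file(NULL);  // default ceph.conf
  cluster.connect();

  librados::IoCtx io;
  cluster.ioctx_create("pool100", io);

  // fetch the omap key/value pairs of rbd_directory
  std::map<std::string, librados::bufferlist> vals;
  io.omap_get_vals("rbd_directory", "", 1024, &vals);

  for (std::map<std::string, librados::bufferlist>::iterator it = vals.begin();
       it != vals.end(); ++it) {
    // value = u32 little-endian length + raw bytes
    const char *p = it->second.c_str();
    uint32_t len;
    memcpy(&len, p, sizeof(len));
    printf("%s -> %.*s\n", it->first.c_str(), (int)len, p + 4);
  }
  return 0;
}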
rbd_id.${image_name} (rbd_id.user1_image1) stores the id of the rbd image. Its 16 bytes are a 4-byte little-endian length (0x0c = 12) followed by the 12-character id:
# rados -p pool100 stat rbd_id.user1_image1
pool100/rbd_id.user1_image1 mtime 1442282704, size 16
# rados -p pool100 get rbd_id.user1_image1 /tmp/f1.txt
# hexdump -C /tmp/f1.txt
00000000 0c 00 00 00 31 33 34 64 32 61 65 38 39 34 34 61 |....134d2ae8944a|
rbd_header.${image_id} (rbd_header.134d2ae8944a) stores the image's metadata:
# rados -p pool100 listomapvals rbd_header.134d2ae8944a
features
value: (8 bytes) :
0000 : 01 00 00 00 00 00 00 00 : ........
object_prefix
value: (25 bytes) :
0000 : 15 00 00 00 72 62 64 5f 64 61 74 61 2e 31 33 34 : ....rbd_data.134
0010 : 64 32 61 65 38 39 34 34 61 : d2ae8944a
order
value: (1 bytes) :
0000 : 16 : .
size
value: (8 bytes) :
0000 : 00 00 00 00 19 00 00 00 : ........
snap_seq
value: (8 bytes) :
0000 : 00 00 00 00 00 00 00 00 : ........
All values are little-endian: size is 0x1900000000 = 107374182400 bytes (102400 MB), and order is 0x16 = 22, i.e. 2^22 = 4 MB objects. This matches rbd info:
# rbd -p pool100 info user1_image1
rbd image 'user1_image1':
size 102400 MB in 25600 objects
order 22 (4096 kB objects)
block_name_prefix: rbd_data.134d2ae8944a
format: 2
features: layering
As shown above, the data objects of user1_image1 use the prefix rbd_data.134d2ae8944a. Map the image and write 8MB of data:
# rbd map pool100/user1_image1
/dev/rbd1
# dd if=/dev/zero of=/dev/rbd1 bs=1048576 count=8
# rados -p pool100 ls
rbd_data.134d2ae8944a.0000000000000000
rbd_data.134d2ae8944a.0000000000000001
rbd_header.134d2ae8944a
rbd_directory
rbd_id.user1_image1
user1_image1 now has two extra 4MB data objects, one per 4MB written.
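A data object's name is the block_name_prefix plus the object number as 16 zero-padded hex digits, where the object number is the image offset shifted right by order. A small sketch of the naming rule (object_for_offset is a hypothetical helper, not a librbd function):

#include <cstdint>
#include <cstdio>
#include <string>

// Map a byte offset inside the image to the RADOS object holding it.
std::string object_for_offset(const std::string &prefix, uint8_t order,
                              uint64_t offset) {
  char name[64];
  snprintf(name, sizeof(name), "%s.%016llx", prefix.c_str(),
           (unsigned long long)(offset >> order));
  return name;
}

// object_for_offset("rbd_data.134d2ae8944a", 22, 5 << 20)
//   -> "rbd_data.134d2ae8944a.0000000000000001"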
Implementation of create image
client
int create_v2(IoCtx& io_ctx, const char *imgname, uint64_t bid, uint64_t size,
int order, uint64_t features, uint64_t stripe_unit,
uint64_t stripe_count)
{
///(1) create the rbd_id.<image_name> object
id_obj = id_obj_name(imgname); ///rbd_id.<image_name>, object id
int r = io_ctx.create(id_obj, true); ///create rbd_id.<image_name> object
///(2) write the image id to rbd_id.<image_name>
extra = rand() % 0xFFFFFFFF;
bid_ss << std::hex << bid << std::hex << extra;
id = bid_ss.str();
r = cls_client::set_id(&io_ctx, id_obj, id); ///rbd set_id
///(3)exec rbd dir_add_image
r = cls_client::dir_add_image(&io_ctx, RBD_DIRECTORY, imgname, id); ///rbd dir_add_image
///(4)exec rbd create
oss << RBD_DATA_PREFIX << id; ///"rbd_data."
header_oid = header_name(id); ///rbd_header.<image_id>
r = cls_client::create_image(&io_ctx, header_oid, size, order, ///rbd create
features, oss.str());
///(5)exec rbd set_stripe_unit_count
if ((stripe_unit || stripe_count) &&
(stripe_count != 1 || stripe_unit != (1ull << order))) {
r = cls_client::set_stripe_unit_count(&io_ctx, header_oid, ///rbd set_stripe_unit_count
stripe_unit, stripe_count);
}
//...
return 0;
}
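For comparison, the public librbd C++ API wraps this whole flow in one call. A sketch, assuming `io` is an open IoCtx for pool100 (create_format2_image is a made-up wrapper name):

#include <rbd/librbd.hpp>

// Creates a format-2 image like the `rbd create` command above,
// driving the create_v2() path internally.
int create_format2_image(librados::IoCtx &io) {
  librbd::RBD rbd;
  int order = 22;                   // 2^22 = 4 MB objects
  uint64_t size = 102400ull << 20;  // 102400 MB
  return rbd.create2(io, "user1_image1", size, RBD_FEATURE_LAYERING, &order);
}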
** exec rbd create **
///cls_rbd_client.cc
int create_image(librados::IoCtx *ioctx, const std::string &oid,
uint64_t size, uint8_t order, uint64_t features,
const std::string &object_prefix)
{
bufferlist bl, bl2;
::encode(size, bl);
::encode(order, bl);
::encode(features, bl);
::encode(object_prefix, bl);
return ioctx->exec(oid, "rbd", "create", bl, bl2);
}
///cls_rbd.cc
/**
* Initialize the header with basic metadata.
* Extra features may initialize more fields in the future.
* Everything is stored as key/value pairs as omaps in the header object.
*
* If features the OSD does not understand are requested, -ENOSYS is
* returned.
*
* Input:
* @param size number of bytes in the image (uint64_t)
* @param order bits to shift to determine the size of data objects (uint8_t)
* @param features what optional things this image will use (uint64_t)
* @param object_prefix a prefix for all the data objects
*
* Output:
* @return 0 on success, negative error code on failure
*/
int create(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
string object_prefix;
uint64_t features, size;
uint8_t order;
try {
bufferlist::iterator iter = in->begin();
::decode(size, iter);
::decode(order, iter);
::decode(features, iter);
::decode(object_prefix, iter);
} catch (const buffer::error &err) {
return -EINVAL;
}
CLS_LOG(20, "create object_prefix=%s size=%llu order=%u features=%llu",
object_prefix.c_str(), (unsigned long long)size, order,
(unsigned long long)features);
if (features & ~RBD_FEATURES_ALL) {
return -ENOSYS;
}
if (!object_prefix.size()) {
return -EINVAL;
}
///the header object must be brand new: if object_prefix already exists,
///an image was already created in this header
bufferlist stored_prefixbl;
int r = cls_cxx_map_get_val(hctx, "object_prefix", &stored_prefixbl);
if (r != -ENOENT) {
CLS_ERR("reading object_prefix returned %d", r);
return -EEXIST;
}
bufferlist sizebl;
::encode(size, sizebl);
r = cls_cxx_map_set_val(hctx, "size", &sizebl);
if (r < 0)
return r;
bufferlist orderbl;
::encode(order, orderbl);
r = cls_cxx_map_set_val(hctx, "order", &orderbl);
if (r < 0)
return r;
bufferlist featuresbl;
::encode(features, featuresbl);
r = cls_cxx_map_set_val(hctx, "features", &featuresbl);
if (r < 0)
return r;
bufferlist object_prefixbl;
::encode(object_prefix, object_prefixbl);
r = cls_cxx_map_set_val(hctx, "object_prefix", &object_prefixbl);
if (r < 0)
return r;
bufferlist snap_seqbl;
uint64_t snap_seq = 0;
::encode(snap_seq, snap_seqbl);
r = cls_cxx_map_set_val(hctx, "snap_seq", &snap_seqbl);
if (r < 0)
return r;
return 0;
}
RBD Snapshot Storage
When we create a snapshot of the image:
# rbd snap create pool100/user1_image1@user1_image1_snap
no new object appears in pool100. Instead, Ceph records the snapshot inside rbd_header.${image_id}:
# rados -p pool100 listomapvals rbd_header.134d2ae8944a
features
value: (8 bytes) :
0000 : 01 00 00 00 00 00 00 00 : ........
object_prefix
value: (25 bytes) :
0000 : 15 00 00 00 72 62 64 5f 64 61 74 61 2e 31 33 34 : ....rbd_data.134
0010 : 64 32 61 65 38 39 34 34 61 : d2ae8944a
order
value: (1 bytes) :
0000 : 16 : .
size
value: (8 bytes) :
0000 : 00 00 00 00 19 00 00 00 : ........
snap_seq
value: (8 bytes) :
0000 : 02 00 00 00 00 00 00 00 : ........
snapshot_0000000000000002
value: (86 bytes) :
0000 : 03 01 50 00 00 00 02 00 00 00 00 00 00 00 11 00 : ..P.............
0010 : 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 5f 73 : ..user1_image1_s
0020 : 6e 61 70 00 00 00 00 19 00 00 00 01 00 00 00 00 : nap.............
0030 : 00 00 00 01 01 1c 00 00 00 ff ff ff ff ff ff ff : ................
0040 : ff 00 00 00 00 fe ff ff ff ff ff ff ff 00 00 00 : ................
0050 : 00 00 00 00 00 00 : ......
snapshot_0000000000000002 (snapshot_$SNAPID) holds the metadata of user1_image1_snap as an encoded cls_rbd_snap; the leading bytes 03 01 50 00 00 00 are the encoder's version/compat/length header, followed by snap id 2, the 17-byte name, the image size, and features:
struct cls_rbd_snap {
snapid_t id;
string name;
uint64_t image_size;
uint64_t features;
uint8_t protection_status;
cls_rbd_parent parent;
uint64_t flags;
};
snap_seq is the snap id of the image's most recent snapshot.
Write 4MB of data and locate object0's PG directory on the OSD (under /var/lib/ceph/osd/ceph-0/current):
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 count=4
# ceph osd map pool100 rbd_data.134d2ae8944a.0000000000000000
osdmap e436 pool 'pool100' (91) object 'rbd_data.134d2ae8944a.0000000000000000' -> pg 91.8cd8ecc2 (91.2) -> up ([0,2,1], p0) acting ([0,2,1], p0)
# ls 91.2_head/
rbd\udata.134d2ae8944a.0000000000000000__2_8CD8ECC2__5b rbd\udata.134d2ae8944a.0000000000000000__head_8CD8ECC2__5b
As shown, rbd_data.134d2ae8944a.0000000000000000 now has an extra file for snap id 2 (the __2_ infix) alongside the head version (__head_).
Ceph implements snapshots with copy-on-write (COW). An image's objects are the head version; before a write to the image, Ceph clones the affected object for the latest snapshot (the snap_seq version) as object_${snap_seq}, and only then updates the head version.
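At the RADOS layer this copy is driven by the snap context that librbd attaches to writes through librados self-managed snapshots. A minimal sketch of the mechanism (assuming an open IoCtx `io`; this is not librbd's actual code):

#include <rados/librados.hpp>
#include <string>
#include <vector>

// Allocate a self-managed snap id and attach it to the write context.
// When the OSD sees that an object's head predates the context's snap id,
// it clones the head (the __2_/__3_ files above) before applying the write.
void snapshot_then_write(librados::IoCtx &io) {
  uint64_t snap_id;
  io.selfmanaged_snap_create(&snap_id);

  std::vector<uint64_t> snaps;
  snaps.push_back(snap_id);  // newest first
  io.selfmanaged_snap_set_write_ctx(snap_id, snaps);

  librados::bufferlist bl;
  bl.append(std::string(4096, 'x'));
  io.write("rbd_data.134d2ae8944a.0000000000000000", bl, bl.length(), 0);
}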
Create another snapshot and write again:
# rbd snap create pool100/user1_image1@user1_image1_snap2
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 count=4
# ls 91.2_head/
rbd\udata.134d2ae8944a.0000000000000000__2_8CD8ECC2__5b rbd\udata.134d2ae8944a.0000000000000000__head_8CD8ECC2__5b
rbd\udata.134d2ae8944a.0000000000000000__3_8CD8ECC2__5b
As shown, rbd_data.134d2ae8944a.0000000000000000 gained another clone, this one for snap id 3.
# rados -p pool100 listomapvals rbd_header.134d2ae8944a
…
snap_seq
value: (8 bytes) :
0000 : 03 00 00 00 00 00 00 00 : ........
snapshot_0000000000000002
value: (86 bytes) :
0000 : 03 01 50 00 00 00 02 00 00 00 00 00 00 00 11 00 : ..P.............
0010 : 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 5f 73 : ..user1_image1_s
0020 : 6e 61 70 00 00 00 00 19 00 00 00 01 00 00 00 00 : nap.............
0030 : 00 00 00 01 01 1c 00 00 00 ff ff ff ff ff ff ff : ................
0040 : ff 00 00 00 00 fe ff ff ff ff ff ff ff 00 00 00 : ................
0050 : 00 00 00 00 00 00 : ......
snapshot_0000000000000003
value: (87 bytes) :
0000 : 03 01 51 00 00 00 03 00 00 00 00 00 00 00 12 00 : ..Q.............
0010 : 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 5f 73 : ..user1_image1_s
0020 : 6e 61 70 32 00 00 00 00 19 00 00 00 01 00 00 00 : nap2............
0030 : 00 00 00 00 01 01 1c 00 00 00 ff ff ff ff ff ff : ................
0040 : ff ff 00 00 00 00 fe ff ff ff ff ff ff ff 00 00 : ................
0050 : 00 00 00 00 00 00 00 : .......
Note how rbd_header.${image_id} changed: snap_seq is now 3 and a snapshot_0000000000000003 entry was added.
Write object1, i.e. the range [4M,8M):
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 seek=4 count=4
# ceph osd map pool100 rbd_data.134d2ae8944a.0000000000000001
osdmap e437 pool 'pool100' (91) object 'rbd_data.134d2ae8944a.0000000000000001' -> pg 91.4a392b12 (91.12) -> up ([0,1,2], p0) acting ([0,1,2], p0)
# ls 91.12_head/
rbd\udata.134d2ae8944a.0000000000000001__3_4A392B12__5b rbd\udata.134d2ae8944a.0000000000000001__head_4A392B12__5b
Ceph clones object1's data for snap id 3 before the overwrite.
Write object2, i.e. the range [8M,12M):
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 seek=8 count=4
# ceph osd map pool100 rbd_data.134d2ae8944a.0000000000000002
osdmap e437 pool 'pool100' (91) object 'rbd_data.134d2ae8944a.0000000000000002' -> pg 91.a7513028 (91.28) -> up ([0,1,2], p0) acting ([0,1,2], p0)
# ls 91.28_head/
rbd\udata.134d2ae8944a.0000000000000002__head_A7513028__5b
Object2 did not exist when either snapshot was taken, so there is nothing to clone; only the head version is created.
create snapshot
int add_snap(ImageCtx *ictx, const char *snap_name)
{
uint64_t snap_id;
///(1)alloc snap_id
int r = ictx->md_ctx.selfmanaged_snap_create(&snap_id);
if (r < 0) {
lderr(ictx->cct) << "failed to create snap id: " << cpp_strerror(-r)
<< dendl;
return r;
}
///(2)exec rbd snapshot_add
if (ictx->old_format) {
r = cls_client::old_snapshot_add(&ictx->md_ctx, ictx->header_oid,
snap_id, snap_name);
} else {
r = cls_client::snapshot_add(&ictx->md_ctx, ictx->header_oid,
snap_id, snap_name);
}
//...
return 0;
}
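Through the public API the whole sequence is one call. A sketch, assuming an open IoCtx `io` (snap_create_example is a made-up wrapper name):

#include <rbd/librbd.hpp>

// Equivalent of `rbd snap create pool100/user1_image1@user1_image1_snap`,
// which ends up in add_snap() above.
int snap_create_example(librados::IoCtx &io) {
  librbd::RBD rbd;
  librbd::Image image;
  int r = rbd.open(io, image, "user1_image1", NULL);
  if (r < 0)
    return r;
  return image.snap_create("user1_image1_snap");
}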
** exec rbd snapshot_add **
///cls_rbd_client.cc
int snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
snapid_t snap_id, const std::string &snap_name)
{
bufferlist bl, bl2;
::encode(snap_name, bl); ///snap name
::encode(snap_id, bl); ///snap id
return ioctx->exec(oid, "rbd", "snapshot_add", bl, bl2);
}
/**
* Adds a snapshot to an rbd header. Ensures the id and name are unique.
*
* Input:
* @param snap_name name of the snapshot (string)
* @param snap_id id of the snapshot (uint64_t)
*
* Output:
* @returns 0 on success, negative error code on failure.
* @returns -ESTALE if the input snap_id is less than the image's snap_seq
* @returns -EEXIST if the id or name are already used by another snapshot
*/
int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
bufferlist snap_namebl, snap_idbl;
cls_rbd_snap snap_meta;
try {
bufferlist::iterator iter = in->begin();
::decode(snap_meta.name, iter);
::decode(snap_meta.id, iter);
} catch (const buffer::error &err) {
return -EINVAL;
}
///....
bufferlist snap_metabl, snap_seqbl;
::encode(snap_meta, snap_metabl);
::encode(snap_meta.id, snap_seqbl); ///the snapshot's snap id
string snapshot_key;
key_from_snap_id(snap_meta.id, &snapshot_key);
map<string, bufferlist> vals;
vals["snap_seq"] = snap_seqbl; ///update the snap_seq field
vals[snapshot_key] = snap_metabl; ///snapshot_$ID = struct cls_rbd_snap
r = cls_cxx_map_set_vals(hctx, &vals);
if (r < 0)
return r;
return 0;
}
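key_from_snap_id is what produces the snapshot_0000000000000002-style keys seen earlier. A sketch of what it does (reconstructed, not a verbatim copy of cls_rbd.cc):

#include <iomanip>
#include <sstream>
#include <string>

// "snapshot_" plus the snap id as 16 zero-padded hex digits, e.g.
// snap id 2 -> "snapshot_0000000000000002".
static std::string key_from_snap_id(uint64_t snap_id) {
  std::ostringstream oss;
  oss << "snapshot_" << std::setw(16) << std::setfill('0') << std::hex
      << snap_id;
  return oss.str();
}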
clone snapshot
# rbd clone pool100/user1_image1@user1_image1_snap pool100/user1_image2
# rados -p pool100 ls
rbd_data.134d2ae8944a.0000000000000000
rbd_children
rbd_data.134d2ae8944a.0000000000000001
rbd_id.user1_image2
rbd_header.134d2ae8944a
rbd_directory
rbd_id.user1_image1
rbd_header.1368238e1f29
As shown, pool100 gained three objects: rbd_children, rbd_header.1368238e1f29, and rbd_id.user1_image2.
** rbd_children **
# rados -p pool100 listomapvals rbd_children
key: (32 bytes):
0000 : 5b 00 00 00 00 00 00 00 0c 00 00 00 31 33 34 64 : [...........134d
0010 : 32 61 65 38 39 34 34 61 02 00 00 00 00 00 00 00 : 2ae8944a........
value: (20 bytes) :
0000 : 01 00 00 00 0c 00 00 00 31 33 36 38 32 33 38 65 : ........1368238e
0010 : 31 66 32 39 : 1f29
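The key packs the parent spec: a 64-bit little-endian pool id (0x5b = 91, i.e. pool100), the length-prefixed parent image id (134d2ae8944a), and the 64-bit snap id (2). The value is a 32-bit count followed by the child image ids (here one child, 1368238e1f29). A hypothetical decoder for the raw key bytes, assuming a little-endian host as in the dump:

#include <cstdint>
#include <cstring>
#include <string>

struct ParentSpec {
  uint64_t pool_id;
  std::string image_id;
  uint64_t snap_id;
};

// Decode the 32-byte rbd_children key shown above.
static ParentSpec decode_children_key(const char *p) {
  ParentSpec spec;
  memcpy(&spec.pool_id, p, 8);  p += 8;       // 5b 00 ... -> pool 91
  uint32_t len;
  memcpy(&len, p, 4);           p += 4;       // 0c 00 00 00 -> 12
  spec.image_id.assign(p, len); p += len;     // "134d2ae8944a"
  memcpy(&spec.snap_id, p, 8);                // 02 00 ... -> snap id 2
  return spec;
}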
** rbd_header.1368238e1f29 **
Compared with user1_image1's header, it has an extra parent field, encoding the parent (pool 91, image id 134d2ae8944a, snap id 2) plus the overlap size:
# rados -p pool100 listomapvals rbd_header.1368238e1f29
features
value: (8 bytes) :
0000 : 01 00 00 00 00 00 00 00 : ........
object_prefix
value: (25 bytes) :
0000 : 15 00 00 00 72 62 64 5f 64 61 74 61 2e 31 33 36 : ....rbd_data.136
0010 : 38 32 33 38 65 31 66 32 39 : 8238e1f29
order
value: (1 bytes) :
0000 : 16 : .
parent
value: (46 bytes) :
0000 : 01 01 28 00 00 00 5b 00 00 00 00 00 00 00 0c 00 : ..(...[.........
0010 : 00 00 31 33 34 64 32 61 65 38 39 34 34 61 02 00 : ..134d2ae8944a..
0020 : 00 00 00 00 00 00 00 00 00 00 19 00 00 00 : ..............
size
value: (8 bytes) :
0000 : 00 00 00 00 19 00 00 00 : ........
snap_seq
value: (8 bytes) :
0000 : 00 00 00 00 00 00 00 00 : ........
read snapshot
Before going deeper, let's recap the operations so far:
create image1
write image1@[object0,object1]
create image1@snap
write image1@object0
create image1@snap2
write image1@object1
write image1@object2
clone image1@snap image2
At this point object0 has clones for snap ids 2 and 3, object1 has a clone for snap id 3 only, and object2 exists only as the head version. As we have seen, cloning user1_image1@user1_image1_snap into user1_image2 created no rbd_data.1368238e1f29.* objects in pool100. So how does Ceph serve a read of user1_image2? For example, reading [4M,8M), i.e. user1_image2@object1:
/* Excerpt from a test program: `cluster` is an already-created and
connected rados_t, `poolname` is "pool100". Requires <rados/librados.h>
and <rbd/librbd.h>. */
#define IMAGE_BUF_SIZE 4194304
rados_ioctx_t io;
rbd_image_t image;
char *buf = malloc(IMAGE_BUF_SIZE);
int err;
err = rados_ioctx_create(cluster, poolname, &io);
if (err < 0) {
fprintf(stderr, "%s: cannot open rados pool %s: %s\n", argv[0], poolname, strerror(-err));
rados_shutdown(cluster);
exit(1);
}
err = rbd_open(io, "user1_image2", &image, NULL);
if (err < 0) {
fprintf(stderr, "open image failed: %s\n", strerror(-err));
goto out;
}
/* read 4MB at offset 4MB, i.e. user1_image2@object1 */
err = rbd_read(image, IMAGE_BUF_SIZE, IMAGE_BUF_SIZE, buf);
if (err < 0) {
fprintf(stderr, "%s: cannot read image: %s\n", poolname, strerror(-err));
} else {
fprintf(stderr, "read image return: %d\n", err);
}
rbd_close(image);
out:
rados_ioctx_destroy(io);
Ceph first tries to read rbd_data.1368238e1f29.0000000000000001, which inevitably returns ENOENT. The client then retries the read of object1 from the parent, user1_image1@snap. There is no clone of object1 for snap itself; the read is served from the snap2 clone, because object1 was not modified between the two snapshots, so for object1 both snap and snap2 resolve to the same snap2 clone.
FileStore::read 91.12_head/4a392b12/rbd_data.134d2ae8944a.0000000000000001/3//91 0~4194304/4194304
The FileStore log above shows the read being served from clone 3 of the parent's object1.
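A minimal sketch of this fallback at the librados level (clone_read is a hypothetical helper, not librbd's actual code path; both images live in pool100 here):

#include <rados/librados.hpp>
#include <cerrno>
#include <string>

// Try the child's object first; on -ENOENT read the same extent from the
// parent image's object at the parent snapshot.
int clone_read(librados::IoCtx &io, const std::string &child_oid,
               const std::string &parent_oid, uint64_t parent_snap_id,
               librados::bufferlist &bl, size_t len, uint64_t off) {
  int r = io.read(child_oid, bl, len, off);
  if (r != -ENOENT)
    return r;                        // data (or a real error) from the child
  io.snap_set_read(parent_snap_id);  // read from the parent's snapshot
  r = io.read(parent_oid, bl, len, off);
  io.snap_set_read(librados::SNAP_HEAD);
  return r;                          // still -ENOENT -> caller zero-fills
}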
If we read rbd_data.1368238e1f29.0000000000000002, the parent (user1_image1@snap) also returns ENOENT, since object2 was written only after both snapshots were taken. In that case librbd constructs a 4MB zero block:
error opening file /var/lib/ceph/osd/ceph-0/current/91.2a_head/rbd\udata.1368238e1f29.0000000000000002__head_EB9D38AA__5b with flags=2: (2) No such file or directory
The error in the log above is the expected ENOENT on the parent's object. librbd handles it as follows:
class AioRequest
{
void complete(int r)
{
if (should_complete(r)) {
if (m_hide_enoent && r == -ENOENT)
r = 0; ///a missing object is expected here, so report success
m_completion->complete(r);
delete this;
}
}
//...
};
void C_AioRead::finish(int r)
{
ldout(m_cct, 10) << "C_AioRead::finish() " << this << " r = " << r << dendl;
if (r >= 0 || r == -ENOENT) { // this was a sparse_read operation
///an ENOENT result still records the extent, with no data; the hole
///is later filled with zeros by assemble_result()
m_completion->destriper.add_partial_sparse_result(
m_cct, m_req->data(), m_req->m_ext_map, m_req->m_object_off,
m_req->m_buffer_extents);
}
//...
}
void Striper::StripedReadResult::assemble_result(CephContext *cct, bufferlist& bl, bool zero_tail)
{
ldout(cct, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail << dendl;
// go backwards, so that we can efficiently discard zeros
map<uint64_t,pair<bufferlist,uint64_t> >::reverse_iterator p = partial.rbegin();
if (p == partial.rend())
return;
uint64_t end = p->first + p->second.second;
while (p != partial.rend()) {
// sanity check
ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second
<< " " << p->second.first.length() << " bytes"
<< dendl;
assert(p->first == end - p->second.second);
end = p->first;
size_t len = p->second.first.length(); ///return data len = 0
if (len < p->second.second) {
if (zero_tail || bl.length()) {
bufferptr bp(p->second.second - len); ///intended len(4M) - data len(0)
bp.zero();
bl.push_front(bp); ///zero block
bl.claim_prepend(p->second.first);
}
}
//... (remaining extents are assembled the same way)
}
}
rbd_read
Overall flow: