RBD Image Storage
When we create an image in an empty pool:
# rbd create -p pool100 user1_image1 --size 102400 --image-format 2
several new objects appear in pool100:
# rados -p pool100 ls
rbd_header.134d2ae8944a
rbd_directory
rbd_id.user1_image1
The rbd_directory object indexes all images in the pool, storing both id-to-name (id_<image_id>) and name-to-id (name_<image_name>) mappings:
# rados -p pool100 listomapvals rbd_directory
id_134d2ae8944a
value: (16 bytes) :
0000 : 0c 00 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 : ....user1_image1
name_user1_image1
value: (16 bytes) :
0000 : 0c 00 00 00 31 33 34 64 32 61 65 38 39 34 34 61 : ....134d2ae8944a
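Each value uses Ceph's standard string encoding: a 32-bit little-endian length (0x0c = 12) followed by the raw bytes. As a minimal sketch (assuming a reachable cluster and the default ceph.conf; not part of any rbd tool), the directory can be dumped programmatically through librados' omap API:

#include <rados/librados.hpp>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <map>
#include <string>

int main() {
  librados::Rados cluster;
  cluster.init(NULL);            // connect as client.admin
  cluster.conf_read_file(NULL);  // default ceph.conf
  cluster.connect();

  librados::IoCtx io;
  cluster.ioctx_create("pool100", io);

  // fetch the omap key/value pairs of rbd_directory
  std::map<std::string, librados::bufferlist> vals;
  io.omap_get_vals("rbd_directory", "", 1024, &vals);

  for (std::map<std::string, librados::bufferlist>::iterator it = vals.begin();
       it != vals.end(); ++it) {
    // value = u32 little-endian length + raw bytes
    const char *p = it->second.c_str();
    uint32_t len;
    memcpy(&len, p, sizeof(len));
    printf("%s -> %.*s\n", it->first.c_str(), (int)len, p + 4);
  }
  return 0;
}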
rbd_id.${image_name} (rbd_id.user1_image1) stores the id of the rbd image. Its 16 bytes are a 4-byte little-endian length (0x0c = 12) followed by the 12-character id:
# rados -p pool100 stat rbd_id.user1_image1
pool100/rbd_id.user1_image1 mtime 1442282704, size 16
# rados -p pool100 get rbd_id.user1_image1 /tmp/f1.txt
# hexdump -C /tmp/f1.txt
00000000 0c 00 00 00 31 33 34 64 32 61 65 38 39 34 34 61 |....134d2ae8944a|
rbd_header.${image_id} (rbd_header.134d2ae8944a) stores the image's metadata:
# rados -p pool100 listomapvals rbd_header.134d2ae8944a
features
value: (8 bytes) :
0000 : 01 00 00 00 00 00 00 00 : ........
object_prefix
value: (25 bytes) :
0000 : 15 00 00 00 72 62 64 5f 64 61 74 61 2e 31 33 34 : ....rbd_data.134
0010 : 64 32 61 65 38 39 34 34 61 : d2ae8944a
order
value: (1 bytes) :
0000 : 16 : .
size
value: (8 bytes) :
0000 : 00 00 00 00 19 00 00 00 : ........
snap_seq
value: (8 bytes) :
0000 : 00 00 00 00 00 00 00 00 : ........
All values are little-endian: size is 0x1900000000 = 107374182400 bytes (102400 MB), and order is 0x16 = 22, i.e. 2^22 = 4 MB objects. This matches rbd info:
# rbd -p pool100 info user1_image1
rbd image 'user1_image1':
size 102400 MB in 25600 objects
order 22 (4096 kB objects)
block_name_prefix: rbd_data.134d2ae8944a
format: 2
features: layering
As shown above, the data objects of user1_image1 use the prefix rbd_data.134d2ae8944a. Map the image and write 8MB of data:
# rbd map pool100/user1_image1
/dev/rbd1
# dd if=/dev/zero of=/dev/rbd1 bs=1048576 count=8
# rados -p pool100 ls
rbd_data.134d2ae8944a.0000000000000000
rbd_data.134d2ae8944a.0000000000000001
rbd_header.134d2ae8944a
rbd_directory
rbd_id.user1_image1
user1_image1 now has two extra 4MB data objects, one per 4MB written.
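A data object's name is the block_name_prefix plus the object number as 16 zero-padded hex digits, where the object number is the image offset shifted right by order. A small sketch of the naming rule (object_for_offset is a hypothetical helper, not a librbd function):

#include <cstdint>
#include <cstdio>
#include <string>

// Map a byte offset inside the image to the RADOS object holding it.
std::string object_for_offset(const std::string &prefix, uint8_t order,
                              uint64_t offset) {
  char name[64];
  snprintf(name, sizeof(name), "%s.%016llx", prefix.c_str(),
           (unsigned long long)(offset >> order));
  return name;
}

// object_for_offset("rbd_data.134d2ae8944a", 22, 5 << 20)
//   -> "rbd_data.134d2ae8944a.0000000000000001"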
Implementation of create image
client
int create_v2(IoCtx& io_ctx, const char *imgname, uint64_t bid, uint64_t size,
int order, uint64_t features, uint64_t stripe_unit,
uint64_t stripe_count)
{
///(1) create the rbd_id.<image_name> object
id_obj = id_obj_name(imgname); ///rbd_id.<image_name>, object id
int r = io_ctx.create(id_obj, true); ///create rbd_id.<image_name> object
///(2) write the image id to rbd_id.<image_name>
extra = rand() % 0xFFFFFFFF;
bid_ss << std::hex << bid << std::hex << extra;
id = bid_ss.str();
r = cls_client::set_id(&io_ctx, id_obj, id); ///rbd set_id
///(3)exec rbd dir_add_image
r = cls_client::dir_add_image(&io_ctx, RBD_DIRECTORY, imgname, id); ///rbd dir_add_image
///(4)exec rbd create
oss << RBD_DATA_PREFIX << id; ///"rbd_data."
header_oid = header_name(id); ///rbd_header.<image_id>
r = cls_client::create_image(&io_ctx, header_oid, size, order, ///rbd create
features, oss.str());
///(5)exec rbd set_stripe_unit_count
if ((stripe_unit || stripe_count) &&
(stripe_count != 1 || stripe_unit != (1ull << order))) {
r = cls_client::set_stripe_unit_count(&io_ctx, header_oid, ///rbd set_stripe_unit_count
stripe_unit, stripe_count);
}
//...
return 0;
}
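For comparison, the public librbd C++ API wraps this whole flow in one call. A sketch, assuming `io` is an open IoCtx for pool100 (create_format2_image is a made-up wrapper name):

#include <rbd/librbd.hpp>

// Creates a format-2 image like the `rbd create` command above,
// driving the create_v2() path internally.
int create_format2_image(librados::IoCtx &io) {
  librbd::RBD rbd;
  int order = 22;                   // 2^22 = 4 MB objects
  uint64_t size = 102400ull << 20;  // 102400 MB
  return rbd.create2(io, "user1_image1", size, RBD_FEATURE_LAYERING, &order);
}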
** exec rbd create **
///cls_rbd_client.cc
int create_image(librados::IoCtx *ioctx, const std::string &oid,
uint64_t size, uint8_t order, uint64_t features,
const std::string &object_prefix)
{
bufferlist bl, bl2;
::encode(size, bl);
::encode(order, bl);
::encode(features, bl);
::encode(object_prefix, bl);
return ioctx->exec(oid, "rbd", "create", bl, bl2);
}
///cls_rbd.cc
/**
* Initialize the header with basic metadata.
* Extra features may initialize more fields in the future.
* Everything is stored as key/value pairs as omaps in the header object.
*
* If features the OSD does not understand are requested, -ENOSYS is
* returned.
*
* Input:
* @param size number of bytes in the image (uint64_t)
* @param order bits to shift to determine the size of data objects (uint8_t)
* @param features what optional things this image will use (uint64_t)
* @param object_prefix a prefix for all the data objects
*
* Output:
* @return 0 on success, negative error code on failure
*/
int create(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
string object_prefix;
uint64_t features, size;
uint8_t order;
try {
bufferlist::iterator iter = in->begin();
::decode(size, iter);
::decode(order, iter);
::decode(features, iter);
::decode(object_prefix, iter);
} catch (const buffer::error &err) {
return -EINVAL;
}
CLS_LOG(20, "create object_prefix=%s size=%llu order=%u features=%llu",
object_prefix.c_str(), (unsigned long long)size, order,
(unsigned long long)features);
if (features & ~RBD_FEATURES_ALL) {
return -ENOSYS;
}
if (!object_prefix.size()) {
return -EINVAL;
}
///the header object must be brand new: if object_prefix already exists,
///an image was already created in this header
bufferlist stored_prefixbl;
int r = cls_cxx_map_get_val(hctx, "object_prefix", &stored_prefixbl);
if (r != -ENOENT) {
CLS_ERR("reading object_prefix returned %d", r);
return -EEXIST;
}
bufferlist sizebl;
::encode(size, sizebl);
r = cls_cxx_map_set_val(hctx, "size", &sizebl);
if (r < 0)
return r;
bufferlist orderbl;
::encode(order, orderbl);
r = cls_cxx_map_set_val(hctx, "order", &orderbl);
if (r < 0)
return r;
bufferlist featuresbl;
::encode(features, featuresbl);
r = cls_cxx_map_set_val(hctx, "features", &featuresbl);
if (r < 0)
return r;
bufferlist object_prefixbl;
::encode(object_prefix, object_prefixbl);
r = cls_cxx_map_set_val(hctx, "object_prefix", &object_prefixbl);
if (r < 0)
return r;
bufferlist snap_seqbl;
uint64_t snap_seq = 0;
::encode(snap_seq, snap_seqbl);
r = cls_cxx_map_set_val(hctx, "snap_seq", &snap_seqbl);
if (r < 0)
return r;
return 0;
}
RBD Snapshot Storage
When we create a snapshot of the image:
# rbd snap create pool100/user1_image1@user1_image1_snap
no new object appears in pool100. Instead, Ceph records the snapshot inside rbd_header.${image_id}:
# rados -p pool100 listomapvals rbd_header.134d2ae8944a
features
value: (8 bytes) :
0000 : 01 00 00 00 00 00 00 00 : ........
object_prefix
value: (25 bytes) :
0000 : 15 00 00 00 72 62 64 5f 64 61 74 61 2e 31 33 34 : ....rbd_data.134
0010 : 64 32 61 65 38 39 34 34 61 : d2ae8944a
order
value: (1 bytes) :
0000 : 16 : .
size
value: (8 bytes) :
0000 : 00 00 00 00 19 00 00 00 : ........
snap_seq
value: (8 bytes) :
0000 : 02 00 00 00 00 00 00 00 : ........
snapshot_0000000000000002
value: (86 bytes) :
0000 : 03 01 50 00 00 00 02 00 00 00 00 00 00 00 11 00 : ..P.............
0010 : 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 5f 73 : ..user1_image1_s
0020 : 6e 61 70 00 00 00 00 19 00 00 00 01 00 00 00 00 : nap.............
0030 : 00 00 00 01 01 1c 00 00 00 ff ff ff ff ff ff ff : ................
0040 : ff 00 00 00 00 fe ff ff ff ff ff ff ff 00 00 00 : ................
0050 : 00 00 00 00 00 00 : ......
snapshot_0000000000000002 (snapshot_$SNAPID) holds the metadata of user1_image1_snap as an encoded cls_rbd_snap; the leading bytes 03 01 50 00 00 00 are the encoder's version/compat/length header, followed by snap id 2, the 17-byte name, the image size, and features:
struct cls_rbd_snap {
snapid_t id;
string name;
uint64_t image_size;
uint64_t features;
uint8_t protection_status;
cls_rbd_parent parent;
uint64_t flags;
};
snap_seq is the snap id of the image's most recent snapshot.
Write 4MB of data and locate object0's PG directory on the OSD (under /var/lib/ceph/osd/ceph-0/current):
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 count=4
# ceph osd map pool100 rbd_data.134d2ae8944a.0000000000000000
osdmap e436 pool 'pool100' (91) object 'rbd_data.134d2ae8944a.0000000000000000' -> pg 91.8cd8ecc2 (91.2) -> up ([0,2,1], p0) acting ([0,2,1], p0)
# ls 91.2_head/
rbd\udata.134d2ae8944a.0000000000000000__2_8CD8ECC2__5b rbd\udata.134d2ae8944a.0000000000000000__head_8CD8ECC2__5b
As shown, rbd_data.134d2ae8944a.0000000000000000 now has an extra file for snap id 2 (the __2_ infix) alongside the head version (__head_).
Ceph implements snapshots with copy-on-write (COW). An image's objects are the head version; before a write to the image, Ceph clones the affected object for the latest snapshot (the snap_seq version) as object_${snap_seq}, and only then updates the head version.
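At the RADOS layer this copy is driven by the snap context that librbd attaches to writes through librados self-managed snapshots. A minimal sketch of the mechanism (assuming an open IoCtx `io`; this is not librbd's actual code):

#include <rados/librados.hpp>
#include <string>
#include <vector>

// Allocate a self-managed snap id and attach it to the write context.
// When the OSD sees that an object's head predates the context's snap id,
// it clones the head (the __2_/__3_ files above) before applying the write.
void snapshot_then_write(librados::IoCtx &io) {
  uint64_t snap_id;
  io.selfmanaged_snap_create(&snap_id);

  std::vector<uint64_t> snaps;
  snaps.push_back(snap_id);  // newest first
  io.selfmanaged_snap_set_write_ctx(snap_id, snaps);

  librados::bufferlist bl;
  bl.append(std::string(4096, 'x'));
  io.write("rbd_data.134d2ae8944a.0000000000000000", bl, bl.length(), 0);
}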
Create another snapshot and write again:
# rbd snap create pool100/user1_image1@user1_image1_snap2
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 count=4
# ls 91.2_head/
rbd\udata.134d2ae8944a.0000000000000000__2_8CD8ECC2__5b rbd\udata.134d2ae8944a.0000000000000000__head_8CD8ECC2__5b
rbd\udata.134d2ae8944a.0000000000000000__3_8CD8ECC2__5b
As shown, rbd_data.134d2ae8944a.0000000000000000 gained another clone, this one for snap id 3.
# rados -p pool100 listomapvals rbd_header.134d2ae8944a
…
snap_seq
value: (8 bytes) :
0000 : 03 00 00 00 00 00 00 00 : ........
snapshot_0000000000000002
value: (86 bytes) :
0000 : 03 01 50 00 00 00 02 00 00 00 00 00 00 00 11 00 : ..P.............
0010 : 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 5f 73 : ..user1_image1_s
0020 : 6e 61 70 00 00 00 00 19 00 00 00 01 00 00 00 00 : nap.............
0030 : 00 00 00 01 01 1c 00 00 00 ff ff ff ff ff ff ff : ................
0040 : ff 00 00 00 00 fe ff ff ff ff ff ff ff 00 00 00 : ................
0050 : 00 00 00 00 00 00 : ......
snapshot_0000000000000003
value: (87 bytes) :
0000 : 03 01 51 00 00 00 03 00 00 00 00 00 00 00 12 00 : ..Q.............
0010 : 00 00 75 73 65 72 31 5f 69 6d 61 67 65 31 5f 73 : ..user1_image1_s
0020 : 6e 61 70 32 00 00 00 00 19 00 00 00 01 00 00 00 : nap2............
0030 : 00 00 00 00 01 01 1c 00 00 00 ff ff ff ff ff ff : ................
0040 : ff ff 00 00 00 00 fe ff ff ff ff ff ff ff 00 00 : ................
0050 : 00 00 00 00 00 00 00 : .......
Note how rbd_header.${image_id} changed: snap_seq is now 3 and a snapshot_0000000000000003 entry was added.
Write object1, i.e. the range [4M,8M):
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 seek=4 count=4
# ceph osd map pool100 rbd_data.134d2ae8944a.0000000000000001
osdmap e437 pool 'pool100' (91) object 'rbd_data.134d2ae8944a.0000000000000001' -> pg 91.4a392b12 (91.12) -> up ([0,1,2], p0) acting ([0,1,2], p0)
# ls 91.12_head/
rbd\udata.134d2ae8944a.0000000000000001__3_4A392B12__5b rbd\udata.134d2ae8944a.0000000000000001__head_4A392B12__5b
Ceph clones object1's data for snap id 3 before the overwrite.
Write object2, i.e. the range [8M,12M):
# dd if=/dev/sda1 of=/dev/rbd1 bs=1048576 seek=8 count=4
# ceph osd map pool100 rbd_data.134d2ae8944a.0000000000000002
osdmap e437 pool 'pool100' (91) object 'rbd_data.134d2ae8944a.0000000000000002' -> pg 91.a7513028 (91.28) -> up ([0,1,2], p0) acting ([0,1,2], p0)
# ls 91.28_head/
rbd\udata.134d2ae8944a.0000000000000002__head_A7513028__5b
Object2 did not exist when either snapshot was taken, so there is nothing to clone; only the head version is created.
create snapshot
int add_snap(ImageCtx *ictx, const char *snap_name)
{
uint64_t snap_id;
///(1)alloc snap_id
int r = ictx->md_ctx.selfmanaged_snap_create(&snap_id);
if (r < 0) {
lderr(ictx->cct) << "failed to create snap id: " << cpp_strerror(-r)
<< dendl;
return r;
}
///(2)exec rbd snapshot_add
if (ictx->old_format) {
r = cls_client::old_snapshot_add(&ictx->md_ctx, ictx->header_oid,
snap_id, snap_name);
} else {
r = cls_client::snapshot_add(&ictx->md_ctx, ictx->header_oid,
snap_id, snap_name);
}
//...
return 0;
}
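Through the public API the whole sequence is one call. A sketch, assuming an open IoCtx `io` (snap_create_example is a made-up wrapper name):

#include <rbd/librbd.hpp>

// Equivalent of `rbd snap create pool100/user1_image1@user1_image1_snap`,
// which ends up in add_snap() above.
int snap_create_example(librados::IoCtx &io) {
  librbd::RBD rbd;
  librbd::Image image;
  int r = rbd.open(io, image, "user1_image1", NULL);
  if (r < 0)
    return r;
  return image.snap_create("user1_image1_snap");
}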
** exec rbd snapshot_add **
///cls_rbd_client.cc
int snapshot_add(librados::IoCtx *ioctx, const std::string &oid,
snapid_t snap_id, const std::string &snap_name)
{
bufferlist bl, bl2;
::encode(snap_name, bl); ///snap name
::encode(snap_id, bl); ///snap id
return ioctx->exec(oid, "rbd", "snapshot_add", bl, bl2);
}
/**
* Adds a snapshot to an rbd header. Ensures the id and name are unique.
*
* Input:
* @param snap_name name of the snapshot (string)
* @param snap_id id of the snapshot (uint64_t)
*
* Output:
* @returns 0 on success, negative error code on failure.
* @returns -ESTALE if the input snap_id is less than the image's snap_seq
* @returns -EEXIST if the id or name are already used by another snapshot
*/
int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
{
bufferlist snap_namebl, snap_idbl;
cls_rbd_snap snap_meta;
try {
bufferlist::iterator iter = in->begin();
::decode(snap_meta.name, iter);
::decode(snap_meta.id, iter);
} catch (const buffer::error &err) {
return -EINVAL;
}
///....
bufferlist snap_metabl, snap_seqbl;
::encode(snap_meta, snap_metabl);
::encode(snap_meta.id, snap_seqbl); ///the snapshot's snap id
string snapshot_key;
key_from_snap_id(snap_meta.id, &snapshot_key);
map<string, bufferlist> vals;
vals["snap_seq"] = snap_seqbl; ///update the snap_seq field
vals[snapshot_key] = snap_metabl; ///snapshot_$ID = struct cls_rbd_snap
r = cls_cxx_map_set_vals(hctx, &vals);
if (r < 0)
return r;
return 0;
}
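key_from_snap_id is what produces the snapshot_0000000000000002-style keys seen earlier. A sketch of what it does (reconstructed, not a verbatim copy of cls_rbd.cc):

#include <iomanip>
#include <sstream>
#include <string>

// "snapshot_" plus the snap id as 16 zero-padded hex digits, e.g.
// snap id 2 -> "snapshot_0000000000000002".
static std::string key_from_snap_id(uint64_t snap_id) {
  std::ostringstream oss;
  oss << "snapshot_" << std::setw(16) << std::setfill('0') << std::hex
      << snap_id;
  return oss.str();
}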
clone snapshot
# rbd clone pool100/user1_image1@user1_image1_snap pool100/user1_image2
# rados -p pool100 ls
rbd_data.134d2ae8944a.0000000000000000
rbd_children
rbd_data.134d2ae8944a.0000000000000001
rbd_id.user1_image2
rbd_header.134d2ae8944a
rbd_directory
rbd_id.user1_image1
rbd_header.1368238e1f29
As shown, pool100 gained three objects: rbd_children, rbd_header.1368238e1f29, and rbd_id.user1_image2.
** rbd_children **
# rados -p pool100 listomapvals rbd_children
key: (32 bytes):
0000 : 5b 00 00 00 00 00 00 00 0c 00 00 00 31 33 34 64 : [...........134d
0010 : 32 61 65 38 39 34 34 61 02 00 00 00 00 00 00 00 : 2ae8944a........
value: (20 bytes) :
0000 : 01 00 00 00 0c 00 00 00 31 33 36 38 32 33 38 65 : ........1368238e
0010 : 31 66 32 39 : 1f29
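The key packs the parent spec: a 64-bit little-endian pool id (0x5b = 91, i.e. pool100), the length-prefixed parent image id (134d2ae8944a), and the 64-bit snap id (2). The value is a 32-bit count followed by the child image ids (here one child, 1368238e1f29). A hypothetical decoder for the raw key bytes, assuming a little-endian host as in the dump:

#include <cstdint>
#include <cstring>
#include <string>

struct ParentSpec {
  uint64_t pool_id;
  std::string image_id;
  uint64_t snap_id;
};

// Decode the 32-byte rbd_children key shown above.
static ParentSpec decode_children_key(const char *p) {
  ParentSpec spec;
  memcpy(&spec.pool_id, p, 8);  p += 8;       // 5b 00 ... -> pool 91
  uint32_t len;
  memcpy(&len, p, 4);           p += 4;       // 0c 00 00 00 -> 12
  spec.image_id.assign(p, len); p += len;     // "134d2ae8944a"
  memcpy(&spec.snap_id, p, 8);                // 02 00 ... -> snap id 2
  return spec;
}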
** rbd_header.1368238e1f29 **
Compared with user1_image1's header, it has an extra parent field, encoding the parent (pool 91, image id 134d2ae8944a, snap id 2) plus the overlap size:
# rados -p pool100 listomapvals rbd_header.1368238e1f29
features
value: (8 bytes) :
0000 : 01 00 00 00 00 00 00 00 : ........
object_prefix
value: (25 bytes) :
0000 : 15 00 00 00 72 62 64 5f 64 61 74 61 2e 31 33 36 : ....rbd_data.136
0010 : 38 32 33 38 65 31 66 32 39 : 8238e1f29
order
value: (1 bytes) :
0000 : 16 : .
parent
value: (46 bytes) :
0000 : 01 01 28 00 00 00 5b 00 00 00 00 00 00 00 0c 00 : ..(...[.........
0010 : 00 00 31 33 34 64 32 61 65 38 39 34 34 61 02 00 : ..134d2ae8944a..
0020 : 00 00 00 00 00 00 00 00 00 00 19 00 00 00 : ..............
size
value: (8 bytes) :
0000 : 00 00 00 00 19 00 00 00 : ........
snap_seq
value: (8 bytes) :
0000 : 00 00 00 00 00 00 00 00 : ........
read snapshot
Before going deeper, let's recap the operations so far:
create image1
write image1@[object0,object1]
create image1@snap
write image1@object0
create image1@snap2
write image1@object1
write image1@object2
clone image1@snap image2
At this point object0 has clones for snap ids 2 and 3, object1 has a clone for snap id 3 only, and object2 exists only as the head version. As we have seen, cloning user1_image1@user1_image1_snap into user1_image2 created no rbd_data.1368238e1f29.* objects in pool100. So how does Ceph serve a read of user1_image2? For example, reading [4M,8M), i.e. user1_image2@object1:
/* Excerpt from a test program: `cluster` is an already-created and
connected rados_t, `poolname` is "pool100". Requires <rados/librados.h>
and <rbd/librbd.h>. */
#define IMAGE_BUF_SIZE 4194304
rados_ioctx_t io;
rbd_image_t image;
char *buf = malloc(IMAGE_BUF_SIZE);
int err;
err = rados_ioctx_create(cluster, poolname, &io);
if (err < 0) {
fprintf(stderr, "%s: cannot open rados pool %s: %s\n", argv[0], poolname, strerror(-err));
rados_shutdown(cluster);
exit(1);
}
err = rbd_open(io, "user1_image2", &image, NULL);
if (err < 0) {
fprintf(stderr, "open image failed: %s\n", strerror(-err));
goto out;
}
/* read 4MB at offset 4MB, i.e. user1_image2@object1 */
err = rbd_read(image, IMAGE_BUF_SIZE, IMAGE_BUF_SIZE, buf);
if (err < 0) {
fprintf(stderr, "%s: cannot read image: %s\n", poolname, strerror(-err));
} else {
fprintf(stderr, "read image return: %d\n", err);
}
rbd_close(image);
out:
rados_ioctx_destroy(io);
Ceph first tries to read rbd_data.1368238e1f29.0000000000000001, which inevitably returns ENOENT. The client then retries the read of object1 from the parent, user1_image1@snap. There is no clone of object1 for snap itself; the read is served from the snap2 clone, because object1 was not modified between the two snapshots, so for object1 both snap and snap2 resolve to the same snap2 clone.
FileStore::read 91.12_head/4a392b12/rbd_data.134d2ae8944a.0000000000000001/3//91 0~4194304/4194304
The FileStore log above shows the read being served from clone 3 of the parent's object1.
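A minimal sketch of this fallback at the librados level (clone_read is a hypothetical helper, not librbd's actual code path; both images live in pool100 here):

#include <rados/librados.hpp>
#include <cerrno>
#include <string>

// Try the child's object first; on -ENOENT read the same extent from the
// parent image's object at the parent snapshot.
int clone_read(librados::IoCtx &io, const std::string &child_oid,
               const std::string &parent_oid, uint64_t parent_snap_id,
               librados::bufferlist &bl, size_t len, uint64_t off) {
  int r = io.read(child_oid, bl, len, off);
  if (r != -ENOENT)
    return r;                        // data (or a real error) from the child
  io.snap_set_read(parent_snap_id);  // read from the parent's snapshot
  r = io.read(parent_oid, bl, len, off);
  io.snap_set_read(librados::SNAP_HEAD);
  return r;                          // still -ENOENT -> caller zero-fills
}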
If we read rbd_data.1368238e1f29.0000000000000002, the parent (user1_image1@snap) also returns ENOENT, since object2 was written only after both snapshots were taken. In that case librbd constructs a 4MB zero block:
error opening file /var/lib/ceph/osd/ceph-0/current/91.2a_head/rbd\udata.1368238e1f29.0000000000000002__head_EB9D38AA__5b with flags=2: (2) No such file or directory
The error in the log above is the expected ENOENT on the parent's object. librbd handles it as follows:
class AioRequest
{
void complete(int r)
{
if (should_complete(r)) {
if (m_hide_enoent && r == -ENOENT)
r = 0; ///a missing object is expected here, so report success
m_completion->complete(r);
delete this;
}
}
//...
};
void C_AioRead::finish(int r)
{
ldout(m_cct, 10) << "C_AioRead::finish() " << this << " r = " << r << dendl;
if (r >= 0 || r == -ENOENT) { // this was a sparse_read operation
///an ENOENT result still records the extent, with no data; the hole
///is later filled with zeros by assemble_result()
m_completion->destriper.add_partial_sparse_result(
m_cct, m_req->data(), m_req->m_ext_map, m_req->m_object_off,
m_req->m_buffer_extents);
}
//...
}
void Striper::StripedReadResult::assemble_result(CephContext *cct, bufferlist& bl, bool zero_tail)
{
ldout(cct, 10) << "assemble_result(" << this << ") zero_tail=" << zero_tail << dendl;
// go backwards, so that we can efficiently discard zeros
map<uint64_t,pair<bufferlist,uint64_t> >::reverse_iterator p = partial.rbegin();
if (p == partial.rend())
return;
uint64_t end = p->first + p->second.second;
while (p != partial.rend()) {
// sanity check
ldout(cct, 20) << "assemble_result(" << this << ") " << p->first << "~" << p->second.second
<< " " << p->second.first.length() << " bytes"
<< dendl;
assert(p->first == end - p->second.second);
end = p->first;
size_t len = p->second.first.length(); ///return data len = 0
if (len < p->second.second) {
if (zero_tail || bl.length()) {
bufferptr bp(p->second.second - len); ///intended len(4M) - data len(0)
bp.zero();
bl.push_front(bp); ///zero block
bl.claim_prepend(p->second.first);
}
}
//... (remaining extents are assembled the same way)
}
}
rbd_read
Overall flow: