/* zfs/include/sys/arc_impl.h - backupdr - Git at Google */

 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  */

 #ifndef _SYS_ARC_IMPL_H
 #define	_SYS_ARC_IMPL_H

 #include <sys/arc.h>
 #include <sys/zio_crypt.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

 /*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
  *	ARC_mru		- recently used, currently cached
  *	ARC_mru_ghost	- recently used, no longer in cache
  *	ARC_mfu		- frequently used, currently cached
  *	ARC_mfu_ghost	- frequently used, no longer in cache
  *	ARC_l2c_only	- exists in L2ARC but not other states
  * When there are no active references to a buffer, it is
  * linked onto a list in one of these arc states.  These are
  * the only buffers that can be evicted or deleted.  Within each
  * state there are multiple lists, one for meta-data and one for
  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
  * etc.) is tracked separately so that it can be managed more
  * explicitly: favored over data, limited explicitly.
  *
  * Anonymous buffers are buffers that are not associated with
  * a DVA.  These are buffers that hold dirty block copies
  * before they are written to stable storage.  By definition,
  * they are "ref'd" and are considered part of arc_mru
  * that cannot be freed.  Generally, they will acquire a DVA
  * as they are written and migrate onto the arc_mru list.
  *
  * The ARC_l2c_only state is for buffers that are in the second
  * level ARC but no longer in any of the ARC_m* lists.  The second
  * level ARC itself may also contain buffers that are in any of
  * the ARC_m* states - meaning that a buffer can exist in two
  * places.  The reason for the ARC_l2c_only state is to keep the
  * buffer header in the hash table, so that reads that hit the
  * second level ARC benefit from these fast lookups.
  */

 /*
  * Bookkeeping for one ARC state.  The per-type arrays below have
  * ARC_BUFC_NUMTYPES slots, i.e. one per buffer contents type (the
  * arcs_size comment names ARC_BUFC_DATA and ARC_BUFC_METADATA).
  */
 typedef struct arc_state {
 	/*
 	 * lists of evictable buffers, one multilist per contents type
 	 */
 	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
 	/*
 	 * total amount of evictable data in this state, kept separately
 	 * for each contents type
 	 */
 	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
 	/*
 	 * total amount of data in this state; this includes: evictable,
 	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
 	 */
 	zfs_refcount_t arcs_size;
 	/*
 	 * type tag for this state; supports the "dbufs" kstat
 	 */
 	arc_state_type_t arcs_state;
 } arc_state_t;

 /*
  * Callback record for an ARC read: acb_done has the read-completion
  * signature arc_read_done_func_t.  acb_next points at another record of
  * the same type, and l1arc_buf_hdr_t.b_acb points at such a record.
  */
 typedef struct arc_callback arc_callback_t;

 struct arc_callback {
 	/* NOTE(review): presumably handed back to acb_done; confirm */
 	void			*acb_private;
 	arc_read_done_func_t	*acb_done;
 	arc_buf_t		*acb_buf;
 	/*
 	 * NOTE(review): these three flags look like per-request read
 	 * options (encrypted, compressed, no authentication); confirm
 	 * their exact meaning against the code that fills them in.
 	 */
 	boolean_t		acb_encrypted;
 	boolean_t		acb_compressed;
 	boolean_t		acb_noauth;
 	zbookmark_phys_t	acb_zb;
 	zio_t			*acb_zio_dummy;
 	zio_t			*acb_zio_head;
 	/* next record of the same type */
 	arc_callback_t		*acb_next;
 };

 /*
  * Callback record for an ARC write: four hooks sharing the
  * arc_write_done_func_t signature, plus a private pointer and the
  * buffer involved.
  */
 typedef struct arc_write_callback arc_write_callback_t;

 struct arc_write_callback {
 	/* NOTE(review): presumably handed back to the hooks; confirm */
 	void			*awcb_private;
 	/*
 	 * NOTE(review): the hook names suggest successive stages of the
 	 * write (ready, children ready, physically done, done); confirm
 	 * when each one fires against the code that invokes them.
 	 */
 	arc_write_done_func_t	*awcb_ready;
 	arc_write_done_func_t	*awcb_children_ready;
 	arc_write_done_func_t	*awcb_physdone;
 	arc_write_done_func_t	*awcb_done;
 	arc_buf_t		*awcb_buf;
 };

 /*
  * ARC buffers are separated into multiple structs as a memory saving measure:
  *   - Common fields struct, always defined, and embedded within it:
  *       - L2-only fields, always allocated but undefined when not in L2ARC
  *       - L1-only fields, only allocated when in L1ARC
  *
  *           Buffer in L1                     Buffer only in L2
  *    +------------------------+          +------------------------+
  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
  *    |                        |          |                        |
  *    |                        |          |                        |
  *    |                        |          |                        |
  *    +------------------------+          +------------------------+
  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
  *    | (undefined if L1-only) |          |                        |
  *    +------------------------+          +------------------------+
  *    | l1arc_buf_hdr_t        |
  *    |                        |
  *    |                        |
  *    |                        |
  *    |                        |
  *    +------------------------+
  *
  * Because it's possible for the L2ARC to become extremely large, we can wind
  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
  * is minimized by only allocating the fields necessary for an L1-cached buffer
  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
  * words in pointers. arc_hdr_realloc() is used to switch a header between
  * these two allocation states.
  */
 /*
  * L1-only portion of an ARC buffer header, embedded in struct arc_buf_hdr
  * as b_l1hdr.  Per the layout comment above, these fields are only
  * allocated while the header is in the L1ARC.
  */
 typedef struct l1arc_buf_hdr {
 	/* NOTE(review): presumably the lock guards b_freeze_cksum; confirm */
 	kmutex_t		b_freeze_lock;
 	zio_cksum_t		*b_freeze_cksum;

 	/*
 	 * NOTE(review): b_buf looks like the first arc_buf_t attached to
 	 * this header and b_bufcnt the number attached; confirm.
 	 */
 	arc_buf_t		*b_buf;
 	uint32_t		b_bufcnt;
 	/* for waiting on writes to complete */
 	kcondvar_t		b_cv;
 	uint8_t			b_byteswap;


 	/* protected by arc state mutex */
 	/* b_state: the arc_state_t this header currently belongs to */
 	arc_state_t		*b_state;
 	/* NOTE(review): presumably linkage on an arcs_list multilist */
 	multilist_node_t	b_arc_node;

 	/* updated atomically */
 	clock_t			b_arc_access;
 	uint32_t		b_mru_hits;
 	uint32_t		b_mru_ghost_hits;
 	uint32_t		b_mfu_hits;
 	uint32_t		b_mfu_ghost_hits;
 	uint32_t		b_l2_hits;

 	/* self protecting */
 	zfs_refcount_t		b_refcnt;

 	/* read callback record(s); see struct arc_callback */
 	arc_callback_t		*b_acb;
 	/*
 	 * NOTE(review): b_pabd is the header's abd_t data pointer; the
 	 * separate raw encrypted copy is b_rabd in arc_buf_hdr_crypt_t.
 	 * Confirm exactly what form of the data b_pabd holds.
 	 */
 	abd_t			*b_pabd;
 } l1arc_buf_hdr_t;

 /*
  * Encrypted blocks will need to be stored encrypted on the L2ARC
  * disk as they appear in the main pool. In order for this to work we
  * need to pass around the encryption parameters so they can be used
  * to write data to the L2ARC. This struct is only defined in the
  * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
  * flag set.
  */
 /*
  * Encryption sub-header; embedded as the last member (b_crypt_hdr) of
  * struct arc_buf_hdr.
  */
 typedef struct arc_buf_hdr_crypt {
 	abd_t			*b_rabd;	/* raw encrypted data */
 	dmu_object_type_t	b_ot;		/* object type */
 	uint32_t		b_ebufcnt;	/* count of encrypted buffers */

 	/* dsobj for looking up encryption key for l2arc encryption */
 	uint64_t		b_dsobj;

 	/*
 	 * encryption parameters; the array lengths are the ZIO_DATA_*_LEN
 	 * constants (NOTE(review): presumably from <sys/zio_crypt.h>,
 	 * which this header includes -- confirm)
 	 */
 	uint8_t			b_salt[ZIO_DATA_SALT_LEN];
 	uint8_t			b_iv[ZIO_DATA_IV_LEN];

 	/*
 	 * Technically this could be removed since we will always be able to
 	 * get the mac from the bp when we need it. However, it is inconvenient
 	 * for callers of arc code to have to pass a bp in all the time. This
 	 * also allows us to assert that L2ARC data is properly encrypted to
 	 * match the data in the main storage pool.
 	 */
 	uint8_t			b_mac[ZIO_DATA_MAC_LEN];
 } arc_buf_hdr_crypt_t;

 /*
  * Per-device L2ARC state.  l2arc_buf_hdr_t.b_dev and
  * l2arc_write_callback_t.l2wcb_dev both point at one of these.
  * l2ad_hand, l2ad_start and l2ad_end are addresses on the device.
  */
 typedef struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
 	boolean_t		l2ad_first;	/* first sweep through */
 	boolean_t		l2ad_writing;	/* currently writing */
 	kmutex_t		l2ad_mtx;	/* lock for buffer list */
 	list_t			l2ad_buflist;	/* buffer list */
 	list_node_t		l2ad_node;	/* device list node */
 	zfs_refcount_t		l2ad_alloc;	/* allocated bytes */
 } l2arc_dev_t;

 /*
  * L2-only portion of an ARC buffer header, embedded in struct arc_buf_hdr
  * as b_l2hdr.  Per the comment on that member, these fields are undefined
  * when the buffer is not in the L2ARC.
  */
 typedef struct l2arc_buf_hdr {
 	/* protected by arc_buf_hdr mutex */
 	l2arc_dev_t		*b_dev;		/* L2ARC device */
 	uint64_t		b_daddr;	/* disk address, offset byte */
 	/* NOTE(review): presumably a count of L2ARC hits; confirm */
 	uint32_t		b_hits;

 	/*
 	 * NOTE(review): list linkage, presumably for the owning device's
 	 * l2ad_buflist; confirm against the list_create() call.
 	 */
 	list_node_t		b_l2node;
 } l2arc_buf_hdr_t;

 /*
  * Context for an L2ARC write: the device being written to and the head
  * of the list of buffer headers covered by the write.
  */
 typedef struct l2arc_write_callback {
 	l2arc_dev_t	*l2wcb_dev;		/* device info */
 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
 } l2arc_write_callback_t;

 /*
  * Common ARC buffer header.  The three sub-headers come last, in the
  * order b_l2hdr, b_l1hdr, b_crypt_hdr; per the layout comment above, the
  * L1 fields are only allocated while the header is in the L1ARC, so do
  * not reorder these members.
  */
 struct arc_buf_hdr {
 	/* protected by hash lock */
 	/*
 	 * b_dva: the DVA this header is associated with (anonymous buffers
 	 * have none until written, per the comment at the top of the file).
 	 * NOTE(review): b_birth is presumably the block's birth txg; confirm.
 	 */
 	dva_t			b_dva;
 	uint64_t		b_birth;

 	arc_buf_contents_t	b_type;
 	/* NOTE(review): presumably the next header on a hash chain; confirm */
 	arc_buf_hdr_t		*b_hash_next;
 	/* flag bits; ARC_FLAG_ENCRYPTED governs b_crypt_hdr below */
 	arc_flags_t		b_flags;

 	/*
 	 * This field stores the size of the data buffer after
 	 * compression, and is set in the arc's zio completion handlers.
 	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
 	 *
 	 * While the block pointers can store up to 32MB in their psize
 	 * field, we can only store up to 32MB minus 512B. This is due
 	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
 	 * a field of zeros represents 512B in the bp). We can't use a
 	 * bias of 1 since we need to reserve a psize of zero, here, to
 	 * represent holes and embedded blocks.
 	 *
 	 * (With a 16-bit field: 65535 * 512B == 32MB - 512B.)
 	 *
 	 * This isn't a problem in practice, since the maximum size of a
 	 * buffer is limited to 16MB, so we never need to store 32MB in
 	 * this field. Even in the upstream illumos code base, the
 	 * maximum size of a buffer is limited to 16MB.
 	 */
 	uint16_t		b_psize;

 	/*
 	 * This field stores the size of the data buffer before
 	 * compression, and cannot change once set. It is in units
 	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
 	 */
 	uint16_t		b_lsize;	/* immutable */
 	uint64_t		b_spa;		/* immutable */

 	/* L2ARC fields. Undefined when not in L2ARC. */
 	l2arc_buf_hdr_t		b_l2hdr;
 	/* L1ARC fields. Undefined when in l2arc_only state */
 	l1arc_buf_hdr_t		b_l1hdr;
 	/*
 	 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
 	 * is set and the L1 header exists.
 	 */
 	arc_buf_hdr_crypt_t b_crypt_hdr;
 };
 #ifdef __cplusplus
 }
 #endif

 #endif /* _SYS_ARC_IMPL_H */
	/*
	* CDDL HEADER START
	*
	* The contents of this file are subject to the terms of the
	* Common Development and Distribution License (the "License").
	* You may not use this file except in compliance with the License.
	*
	* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
	* or http://www.opensolaris.org/os/licensing.
	* See the License for the specific language governing permissions
	* and limitations under the License.
	*
	* When distributing Covered Code, include this CDDL HEADER in each
	* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
	* If applicable, add the following below this CDDL HEADER, with the
	* fields enclosed by brackets "[]" replaced with your own identifying
	* information: Portions Copyright [yyyy] [name of copyright owner]
	*
	* CDDL HEADER END
	*/
	/*
	* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
	* Copyright (c) 2013 by Delphix. All rights reserved.
	* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
	* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
	*/

	#ifndef _SYS_ARC_IMPL_H
	#define _SYS_ARC_IMPL_H

	#include <sys/arc.h>
	#include <sys/zio_crypt.h>

	#ifdef __cplusplus
	extern "C" {
	#endif

	/*
	* Note that buffers can be in one of 6 states:
	* ARC_anon - anonymous (discussed below)
	* ARC_mru - recently used, currently cached
	* ARC_mru_ghost - recently used, no longer in cache
	* ARC_mfu - frequently used, currently cached
	* ARC_mfu_ghost - frequently used, no longer in cache
	* ARC_l2c_only - exists in L2ARC but not other states
	* When there are no active references to a buffer, it is
	* linked onto a list in one of these arc states. These are
	* the only buffers that can be evicted or deleted. Within each
	* state there are multiple lists, one for meta-data and one for
	* non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
	* etc.) is tracked separately so that it can be managed more
	* explicitly: favored over data, limited explicitly.
	*
	* Anonymous buffers are buffers that are not associated with
	* a DVA. These are buffers that hold dirty block copies
	* before they are written to stable storage. By definition,
	* they are "ref'd" and are considered part of arc_mru
	* that cannot be freed. Generally, they will acquire a DVA
	* as they are written and migrate onto the arc_mru list.
	*
	* The ARC_l2c_only state is for buffers that are in the second
	* level ARC but no longer in any of the ARC_m* lists. The second
	* level ARC itself may also contain buffers that are in any of
	* the ARC_m* states - meaning that a buffer can exist in two
	* places. The reason for the ARC_l2c_only state is to keep the
	* buffer header in the hash table, so that reads that hit the
	* second level ARC benefit from these fast lookups.
	*/

	/*
	 * Descriptor for one ARC state.  Arrays sized by ARC_BUFC_NUMTYPES
	 * hold one entry per buffer contents type.
	 */
	typedef struct arc_state {
		/* Evictable buffers: one multilist per contents type. */
		multilist_t *arcs_list[ARC_BUFC_NUMTYPES];

		/* Amount of evictable data in this state, per contents type. */
		zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];

		/*
		 * Amount of all data in this state: evictable and
		 * non-evictable, ARC_BUFC_DATA and ARC_BUFC_METADATA alike.
		 */
		zfs_refcount_t arcs_size;

		/* Supports the "dbufs" kstat. */
		arc_state_type_t arcs_state;
	} arc_state_t;

	/*
	 * Read callback record.  acb_done uses the arc_read_done_func_t
	 * signature; acb_next points at another record of the same type.
	 */
	typedef struct arc_callback arc_callback_t;

	struct arc_callback {
		void *acb_private;
		arc_read_done_func_t *acb_done;
		arc_buf_t *acb_buf;

		boolean_t acb_encrypted;
		boolean_t acb_compressed;
		boolean_t acb_noauth;

		zbookmark_phys_t acb_zb;
		zio_t *acb_zio_dummy;
		zio_t *acb_zio_head;

		arc_callback_t *acb_next;
	};

	/*
	 * Write callback record: a private pointer, four hooks that share
	 * the arc_write_done_func_t signature, and the buffer involved.
	 */
	typedef struct arc_write_callback arc_write_callback_t;

	struct arc_write_callback {
		void *awcb_private;

		arc_write_done_func_t *awcb_ready;
		arc_write_done_func_t *awcb_children_ready;
		arc_write_done_func_t *awcb_physdone;
		arc_write_done_func_t *awcb_done;

		arc_buf_t *awcb_buf;
	};

	/*
	* ARC buffers are separated into multiple structs as a memory saving measure:
	*   - Common fields struct, always defined, and embedded within it:
	*       - L2-only fields, always allocated but undefined when not in L2ARC
	*       - L1-only fields, only allocated when in L1ARC
	*
	*           Buffer in L1                     Buffer only in L2
	*    +------------------------+          +------------------------+
	*    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
	*    |                        |          |                        |
	*    |                        |          |                        |
	*    |                        |          |                        |
	*    +------------------------+          +------------------------+
	*    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
	*    | (undefined if L1-only) |          |                        |
	*    +------------------------+          +------------------------+
	*    | l1arc_buf_hdr_t        |
	*    |                        |
	*    |                        |
	*    |                        |
	*    |                        |
	*    +------------------------+
	*
	* Because it's possible for the L2ARC to become extremely large, we can wind
	* up eating a lot of memory in L2ARC buffer headers, so the size of a header
	* is minimized by only allocating the fields necessary for an L1-cached buffer
	* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
	* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
	* words in pointers. arc_hdr_realloc() is used to switch a header between
	* these two allocation states.
	*/
	/*
	 * L1-only part of an ARC buffer header; struct arc_buf_hdr embeds
	 * it as b_l1hdr.
	 */
	typedef struct l1arc_buf_hdr {
		kmutex_t b_freeze_lock;
		zio_cksum_t *b_freeze_cksum;

		arc_buf_t *b_buf;
		uint32_t b_bufcnt;
		kcondvar_t b_cv;	/* waiters for write completion */
		uint8_t b_byteswap;

		/* The next two fields are protected by the arc state mutex. */
		arc_state_t *b_state;
		multilist_node_t b_arc_node;

		/* The access time and hit counters are updated atomically. */
		clock_t b_arc_access;
		uint32_t b_mru_hits;
		uint32_t b_mru_ghost_hits;
		uint32_t b_mfu_hits;
		uint32_t b_mfu_ghost_hits;
		uint32_t b_l2_hits;

		/* Self protecting. */
		zfs_refcount_t b_refcnt;

		arc_callback_t *b_acb;
		abd_t *b_pabd;
	} l1arc_buf_hdr_t;

	/*
	* Encrypted blocks will need to be stored encrypted on the L2ARC
	* disk as they appear in the main pool. In order for this to work we
	* need to pass around the encryption parameters so they can be used
	* to write data to the L2ARC. This struct is only defined in the
	* arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
	* flag set.
	*/
	typedef struct arc_buf_hdr_crypt {
	abd_t b_rabd; / raw encrypted data */
	dmu_object_type_t b_ot; /* object type */
	uint32_t b_ebufcnt; /* count of encrypted buffers */

	/* dsobj for looking up encryption key for l2arc encryption */
	uint64_t b_dsobj;

	/* encryption parameters */
	uint8_t b_salt[ZIO_DATA_SALT_LEN];
	uint8_t b_iv[ZIO_DATA_IV_LEN];

	/*
	* Technically this could be removed since we will always be able to
	* get the mac from the bp when we need it. However, it is inconvenient
	* for callers of arc code to have to pass a bp in all the time. This
	* also allows us to assert that L2ARC data is properly encrypted to
	* match the data in the main storage pool.
	*/
	uint8_t b_mac[ZIO_DATA_MAC_LEN];
	} arc_buf_hdr_crypt_t;

	typedef struct l2arc_dev {
	vdev_t l2ad_vdev; / vdev */
	spa_t l2ad_spa; / spa */
	uint64_t l2ad_hand; /* next write location */
	uint64_t l2ad_start; /* first addr on device */
	uint64_t l2ad_end; /* last addr on device */
	boolean_t l2ad_first; /* first sweep through */
	boolean_t l2ad_writing; /* currently writing */
	kmutex_t l2ad_mtx; /* lock for buffer list */
	list_t l2ad_buflist; /* buffer list */
	list_node_t l2ad_node; /* device list node */
	zfs_refcount_t l2ad_alloc; /* allocated bytes */
	} l2arc_dev_t;

	typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t b_dev; / L2ARC device */
	uint64_t b_daddr; /* disk address, offset byte */
	uint32_t b_hits;

	list_node_t b_l2node;
	} l2arc_buf_hdr_t;

	typedef struct l2arc_write_callback {
	l2arc_dev_t l2wcb_dev; / device info */
	arc_buf_hdr_t l2wcb_head; / head of write buflist */
	} l2arc_write_callback_t;

	/*
	 * Common ARC buffer header.  The L2, L1 and encryption sub-headers
	 * are the trailing members, in that order.
	 */
	struct arc_buf_hdr {
		/* protected by hash lock */
		dva_t b_dva;
		uint64_t b_birth;

		arc_buf_contents_t b_type;
		arc_buf_hdr_t *b_hash_next;
		arc_flags_t b_flags;

		/*
		 * Size of the data buffer after compression, set in the
		 * arc's zio completion handlers and expressed in units of
		 * SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
		 *
		 * A block pointer's psize field can describe up to 32MB,
		 * but this field tops out at 32MB minus 512B.  The bp uses
		 * a bias of 1 (a field of zeros represents 512B) whereas
		 * this field uses a bias of 0, because a psize of zero has
		 * to be reserved here to represent holes and embedded
		 * blocks.
		 *
		 * In practice this does not matter: the maximum size of a
		 * buffer is limited to 16MB, so 32MB never needs to be
		 * stored here.  Even in the upstream illumos code base the
		 * maximum buffer size is limited to 16MB.
		 */
		uint16_t b_psize;

		/*
		 * Size of the data buffer before compression; it cannot
		 * change once set.  Expressed in units of SPA_MINBLOCKSIZE
		 * (e.g. 2 == 1024 bytes).
		 */
		uint16_t b_lsize; /* immutable */
		uint64_t b_spa; /* immutable */

		/* L2ARC fields. Undefined when not in L2ARC. */
		l2arc_buf_hdr_t b_l2hdr;

		/* L1ARC fields. Undefined when in l2arc_only state */
		l1arc_buf_hdr_t b_l1hdr;

		/*
		 * Encryption parameters.  Defined only when
		 * ARC_FLAG_ENCRYPTED is set and the L1 header exists.
		 */
		arc_buf_hdr_crypt_t b_crypt_hdr;
	};
	#ifdef __cplusplus
	}
	#endif

	#endif /* _SYS_ARC_IMPL_H */