Index: src/distrib/sets/lists/base/shl.mi =================================================================== RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/base/shl.mi,v retrieving revision 1.823 diff -u -p -r1.823 shl.mi --- src/distrib/sets/lists/base/shl.mi 10 Oct 2017 19:31:56 -0000 1.823 +++ src/distrib/sets/lists/base/shl.mi 10 Oct 2017 20:14:55 -0000 @@ -854,6 +854,9 @@ ./usr/lib/libzfs.so base-zfs-shlib compatfile,zfs ./usr/lib/libzfs.so.0 base-zfs-shlib compatfile,zfs ./usr/lib/libzfs.so.0.0 base-zfs-shlib compatfile,zfs +./usr/lib/libzfs_core.so base-zfs-shlib compatfile,zfs +./usr/lib/libzfs_core.so.0 base-zfs-shlib compatfile,zfs +./usr/lib/libzfs_core.so.0.0 base-zfs-shlib compatfile,zfs ./usr/lib/libzpool.so base-zfs-shlib compatfile,zfs ./usr/lib/libzpool.so.0 base-zfs-shlib compatfile,zfs ./usr/lib/libzpool.so.0.0 base-zfs-shlib compatfile,zfs Index: src/distrib/sets/lists/comp/mi =================================================================== RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/comp/mi,v retrieving revision 1.2151 diff -u -p -r1.2151 mi --- src/distrib/sets/lists/comp/mi 10 Oct 2017 19:31:56 -0000 1.2151 +++ src/distrib/sets/lists/comp/mi 10 Oct 2017 20:14:55 -0000 @@ -3674,6 +3674,8 @@ ./usr/lib/libz_p.a comp-c-proflib compatfile,profile ./usr/lib/libzfs.a comp-zfs-lib compatfile,zfs ./usr/lib/libzfs_p.a comp-zfs-proflib compatfile,zfs,profile +./usr/lib/libzfs_core.a comp-zfs-lib compatfile,zfs +./usr/lib/libzfs_core_p.a comp-zfs-proflib compatfile,zfs,profile ./usr/lib/libzpool.a comp-zfs-lib compatfile,zfs ./usr/lib/libzpool_p.a comp-zfs-proflib compatfile,zfs,profile ./usr/lib/pkgconfig comp-c-lib Index: src/distrib/sets/lists/comp/shl.mi =================================================================== RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/comp/shl.mi,v retrieving revision 1.308 diff -u -p -r1.308 shl.mi --- src/distrib/sets/lists/comp/shl.mi 10 Oct 2017 19:31:56 -0000 1.308 +++ src/distrib/sets/lists/comp/shl.mi 10 Oct 2017 20:14:55 -0000 @@ -274,7 +274,8 @@ ./usr/lib/libwrap_pic.a comp-c-piclib compatfile,picinstall ./usr/lib/libz_pic.a comp-c-piclib compatfile,picinstall ./usr/lib/libzfs_pic.a comp-zfs-piclib compatfile,picinstall,zfs -./usr/lib/libzpool_pic.a comp-zfs-piclib compatfile,zfs,picinstall +./usr/lib/libzfs_core_pic.a comp-zfs-piclib compatfile,picinstall,zfs +./usr/lib/libzpool_pic.a comp-zfs-piclib compatfile,picinstall,zfs ./usr/libexec/liblto_plugin.so comp-c-bin gcc ./usr/libexec/liblto_plugin.so.0 comp-c-bin gcc ./usr/libexec/liblto_plugin.so.0.0 comp-c-bin gcc Index: src/distrib/sets/lists/debug/mi =================================================================== RCS file: /home/chs/netbsd/cvs/src/distrib/sets/lists/debug/mi,v retrieving revision 1.227 diff -u -p -r1.227 mi --- src/distrib/sets/lists/debug/mi 10 Oct 2017 19:31:56 -0000 1.227 +++ src/distrib/sets/lists/debug/mi 10 Oct 2017 20:14:55 -0000 @@ -264,6 +264,7 @@ ./usr/lib/liby_g.a comp-c-debuglib debuglib,compatfile ./usr/lib/libz_g.a comp-c-debuglib debuglib,compatfile ./usr/lib/libzfs_g.a comp-c-debuglib debuglib,compatfile,zfs +./usr/lib/libzfs_core_g.a comp-c-debuglib debuglib,compatfile,zfs ./usr/lib/libzpool_g.a comp-c-debuglib debuglib,compatfile,zfs ./usr/libdata/debug/bin/cat.debug comp-util-debug debug ./usr/libdata/debug/bin/chio.debug comp-util-debug debug Index: src/distrib/sets/lists/debug/shl.mi =================================================================== RCS file: 
/home/chs/netbsd/cvs/src/distrib/sets/lists/debug/shl.mi,v retrieving revision 1.185 diff -u -p -r1.185 shl.mi --- src/distrib/sets/lists/debug/shl.mi 10 Oct 2017 19:31:57 -0000 1.185 +++ src/distrib/sets/lists/debug/shl.mi 10 Oct 2017 20:14:55 -0000 @@ -289,6 +289,7 @@ ./usr/libdata/debug/usr/lib/libwrap.so.1.0.debug comp-net-debug debug,compatfile ./usr/libdata/debug/usr/lib/libz.so.1.0.debug comp-sys-debug debug,compatfile ./usr/libdata/debug/usr/lib/libzfs.so.0.0.debug comp-zfs-debug debug,compatfile,zfs +./usr/libdata/debug/usr/lib/libzfs_core.so.0.0.debug comp-zfs-debug debug,compatfile,zfs ./usr/libdata/debug/usr/lib/libzpool.so.0.0.debug comp-zfs-debug debug,compatfile,zfs ./usr/libdata/debug/usr/lib/npf/ext_log.so.0.0.debug comp-obsolete debug,compatfile,npf,obsolete ./usr/libdata/debug/usr/lib/npf/ext_normalise.so.0.0.debug comp-obsolete debug,compatfile,npf,obsolete Index: src/external/bsd/libproc/dist/libproc.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/bsd/libproc/dist/libproc.h,v retrieving revision 1.3 diff -u -p -r1.3 libproc.h --- src/external/bsd/libproc/dist/libproc.h 9 Jun 2017 01:17:25 -0000 1.3 +++ src/external/bsd/libproc/dist/libproc.h 10 Jun 2017 00:35:09 -0000 @@ -51,6 +51,11 @@ typedef void (*proc_child_func)(void *); #define PS_DEAD 5 #define PS_LOST 6 +/* Flags for proc_attach(). */ +#define PATTACH_FORCE 0x01 +#define PATTACH_RDONLY 0x02 +#define PATTACH_NOSTOP 0x04 + /* Reason values for proc_detach(). */ #define PRELEASE_HANG 1 #define PRELEASE_KILL 2 Index: src/external/cddl/osnet/Makefile.inc =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/Makefile.inc,v retrieving revision 1.3 diff -u -p -r1.3 Makefile.inc --- src/external/cddl/osnet/Makefile.inc 23 Jan 2016 21:22:45 -0000 1.3 +++ src/external/cddl/osnet/Makefile.inc 10 Jun 2017 06:04:19 -0000 @@ -1,8 +1,10 @@ -# $FreeBSD: src/cddl/Makefile.inc,v 1.6.2.1 2009/08/03 08:13:06 kensmith Exp $ +# $FreeBSD: head/cddl/Makefile.inc 270358 2014-08-22 20:04:51Z delphij $ WARNS?=5 + OSNETDIR= ${NETBSDSRCDIR}/external/cddl/osnet OPENSOLARIS_USR_DISTDIR=${OSNETDIR}/dist OPENSOLARIS_SYS_DISTDIR=${OSNETDIR}/dist + CPPFLAGS+=-Wno-unknown-pragmas -Wno-sign-compare -D_KERNTYPES Index: src/external/cddl/osnet/Makefile.zfs =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/Makefile.zfs,v retrieving revision 1.4 diff -u -p -r1.4 Makefile.zfs --- src/external/cddl/osnet/Makefile.zfs 5 Sep 2012 23:08:42 -0000 1.4 +++ src/external/cddl/osnet/Makefile.zfs 10 Jun 2017 05:39:13 -0000 @@ -6,15 +6,13 @@ NOGCCERROR= yes # Basic compilation stuff. -CPPFLAGS+= "-D__va_list=va_list" -CPPFLAGS+= "-Doffsetof(s, m)=((size_t)(&(((s *)0)->m)))" CPPFLAGS+= -std=c99 # Pick a VTOC format - ick. 
CPPFLAGS+= -D_SUNOS_VTOC_16 CPPFLAGS+= -D_PROPLIB_ZFS_CONFLICT -CFLAGS+= -O0 -fno-inline +#CFLAGS+= -O0 -fno-inline #DBG= -g @@ -36,14 +34,24 @@ CPPFLAGS+= -I${ZFSDIR}/dist/lib/libshare CPPFLAGS+= -I${ZFSDIR}/dist/lib/libumem CPPFLAGS+= -I${ZFSDIR}/dist/lib/libuutil/common CPPFLAGS+= -I${ZFSDIR}/dist/lib/libzfs/common +CPPFLAGS+= -I${ZFSDIR}/dist/lib/libzfs_core/common CPPFLAGS+= -I${ZFSDIR}/dist/lib/libzpool/common CPPFLAGS+= -I${ZFSDIR}/dist/common +CWARNFLAGS+= -Wno-missing-field-initializers +CWARNFLAGS+= -Wno-strict-prototypes +CWARNFLAGS+= -Wno-cast-qual +CWARNFLAGS+= -Wno-discarded-qualifiers +CWARNFLAGS+= -Wno-switch +CWARNFLAGS+= -Wno-missing-prototypes +CWARNFLAGS+= -Wno-unused-variable +CWARNFLAGS+= -Wno-shadow + LIBAVL_SRCDIR= ${ZFSDIR}/lib/libavl LIBNVPAIR_SRCDIR= ${ZFSDIR}/lib/libnvpair LIBUMEM_SRCDIR= ${ZFSDIR}/lib/libumem LIBUUTIL_SRCDIR= ${ZFSDIR}/lib/libuutil LIBZFS_SRCDIR= ${ZFSDIR}/lib/libzfs +LIBZFS_CORE_SRCDIR= ${ZFSDIR}/lib/libzfs_core LIBZPOOL_SRCDIR= ${ZFSDIR}/lib/libzpool - Index: src/external/cddl/osnet/dev/cyclic/cyclic.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/cyclic.c,v retrieving revision 1.7 diff -u -p -r1.7 cyclic.c --- src/external/cddl/osnet/dev/cyclic/cyclic.c 1 Feb 2017 21:59:09 -0000 1.7 +++ src/external/cddl/osnet/dev/cyclic/cyclic.c 11 Jun 2017 12:07:53 -0000 @@ -23,7 +23,7 @@ * * Portions Copyright 2008 John Birrell * - * $FreeBSD$ + * $FreeBSD: head/sys/cddl/dev/cyclic/cyclic.c 227293 2011-11-07 06:44:47Z ed $ * * This is a simplified version of the cyclic timer subsystem from * OpenSolaris. In the FreeBSD version, we don't use interrupt levels. @@ -352,7 +352,6 @@ #define mtx_unlock_spin(x) mutex_spin_exit(x) #define mtx_destroy(x) mutex_destroy(x) -#define ASSERT(x) KASSERT(x) #define SYSINIT(a1, a2, a3, a4, a5) #define SYSUNINIT(a1, a2, a3, a4, a5) #define CPU_FOREACH(var) \ Index: src/external/cddl/osnet/dev/cyclic/cyclic_test.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/cyclic_test.c,v retrieving revision 1.2 diff -u -p -r1.2 cyclic_test.c --- src/external/cddl/osnet/dev/cyclic/cyclic_test.c 21 Feb 2010 01:46:33 -0000 1.2 +++ src/external/cddl/osnet/dev/cyclic/cyclic_test.c 11 Jun 2017 12:07:24 -0000 @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/sys/cddl/dev/cyclic/cyclic_test.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/cyclic/cyclic_test.c 179260 2008-05-23 22:21:58Z jb $ * */ Index: src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c,v retrieving revision 1.1 diff -u -p -r1.1 cyclic_machdep.c --- src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c 5 Mar 2014 06:35:44 -0000 1.1 +++ src/external/cddl/osnet/dev/cyclic/arm/cyclic_machdep.c 11 Jun 2017 12:13:45 -0000 @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD$ + * $FreeBSD: head/sys/cddl/dev/cyclic/i386/cyclic_machdep.c 222813 2011-06-07 08:46:13Z attilio $ * */ Index: src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c,v retrieving revision 1.4 diff -u -p -r1.4 cyclic_machdep.c --- src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c 2 Dec 2012 01:05:16 -0000 1.4 +++ src/external/cddl/osnet/dev/cyclic/i386/cyclic_machdep.c 11 Jun 2017 12:09:34 -0000 @@ -24,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD$ + * $FreeBSD: head/sys/cddl/dev/cyclic/i386/cyclic_machdep.c 222813 2011-06-07 08:46:13Z attilio $ * */ Index: src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c,v retrieving revision 1.2 diff -u -p -r1.2 dtmalloc.c --- src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c 21 Feb 2010 01:46:33 -0000 1.2 +++ src/external/cddl/osnet/dev/dtmalloc/dtmalloc.c 10 Jun 2017 16:13:27 -0000 @@ -22,7 +22,7 @@ * * Portions Copyright 2006-2008 John Birrell jb@freebsd.org * - * $FreeBSD: src/sys/cddl/dev/dtmalloc/dtmalloc.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtmalloc/dtmalloc.c 252325 2013-06-28 03:14:40Z markj $ * */ @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -113,8 +114,17 @@ dtmalloc_type_cb(struct malloc_type *mtp { char name[DTRACE_FUNCNAMELEN]; struct malloc_type_internal *mtip = mtp->ks_handle; + int i; + /* + * malloc_type descriptions are allowed to contain whitespace, but + * DTrace probe identifiers are not, so replace the whitespace with + * underscores. + */ strlcpy(name, mtp->ks_shortdesc, sizeof(name)); + for (i = 0; name[i] != 0; i++) + if (isspace(name[i])) + name[i] = '_'; if (dtrace_probe_lookup(dtmalloc_id, NULL, name, "malloc") != 0) return; Index: src/external/cddl/osnet/dev/dtrace/dtrace_anon.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_anon.c,v retrieving revision 1.2 diff -u -p -r1.2 dtrace_anon.c --- src/external/cddl/osnet/dev/dtrace/dtrace_anon.c 21 Feb 2010 01:46:33 -0000 1.2 +++ src/external/cddl/osnet/dev/dtrace/dtrace_anon.c 12 Apr 2017 15:45:10 -0000 @@ -20,7 +20,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_anon.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_anon.c 179237 2008-05-23 05:59:42Z jb $ */ /* Index: src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h,v retrieving revision 1.2 diff -u -p -r1.2 dtrace_cddl.h --- src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h 21 Feb 2010 01:46:33 -0000 1.2 +++ src/external/cddl/osnet/dev/dtrace/dtrace_cddl.h 16 Jun 2017 17:07:29 -0000 @@ -20,7 +20,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_cddl.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_cddl.h 292388 2015-12-17 00:00:27Z markj $ * */ @@ -28,6 +28,11 @@ #define _DTRACE_CDDL_H_ #include +#include + +#define SYSCTL_NODE(...) +#define SYSCTL_DECL(...) +#define SYSCTL_INT(...) 
#define LOCK_LEVEL 10 @@ -38,6 +43,7 @@ typedef struct kdtrace_proc { int p_dtrace_probes; /* Are there probes for this proc? */ u_int64_t p_dtrace_count; /* Number of DTrace tracepoints */ void *p_dtrace_helpers; /* DTrace helpers, if any */ + int p_dtrace_model; } kdtrace_proc_t; @@ -61,6 +67,9 @@ typedef struct kdtrace_thread { /* Handling a return probe. */ u_int8_t _td_dtrace_ast; /* Saved ast flag. */ +#ifdef __amd64__ + u_int8_t _td_dtrace_reg; +#endif } _tds; u_long _td_dtrace_ft; /* Bitwise or of these flags. */ } _tdu; @@ -69,6 +78,7 @@ typedef struct kdtrace_thread { #define td_dtrace_step _tdu._tds._td_dtrace_step #define td_dtrace_ret _tdu._tds._td_dtrace_ret #define td_dtrace_ast _tdu._tds._td_dtrace_ast +#define td_dtrace_reg _tdu._tds._td_dtrace_reg uintptr_t td_dtrace_pc; /* DTrace saved pc from fasttrap. */ uintptr_t td_dtrace_npc; /* DTrace next pc from fasttrap. */ @@ -76,8 +86,12 @@ typedef struct kdtrace_thread { /* DTrace per-thread scratch location. */ uintptr_t td_dtrace_astpc; /* DTrace return sequence location. */ +#ifdef __amd64__ + uintptr_t td_dtrace_regv; +#endif u_int64_t td_hrtime; /* Last time on cpu. */ - int td_errno; /* Syscall return value. */ + void *td_dtrace_sscr; /* Saved scratch space location. */ + void *td_systrace_args; /* syscall probe arguments. */ } kdtrace_thread_t; /* @@ -86,21 +100,47 @@ typedef struct kdtrace_thread { * that the separation on FreeBSD is a licensing constraint designed to * keep the GENERIC kernel BSD licensed. */ -#define t_dtrace_vtime l_dtrace->td_dtrace_vtime -#define t_dtrace_start l_dtrace->td_dtrace_start -#define t_dtrace_stop l_dtrace->td_dtrace_stop -#define t_dtrace_sig l_dtrace->td_dtrace_sig -#define t_predcache l_dtrace->td_predcache -#define p_dtrace_helpers p_dtrace->p_dtrace_helpers +#define td_dtrace l_dtrace +#define t_dtrace_vtime td_dtrace->td_dtrace_vtime +#define t_dtrace_start td_dtrace->td_dtrace_start +#define t_dtrace_stop td_dtrace->td_dtrace_stop +#define t_dtrace_sig td_dtrace->td_dtrace_sig +#define t_predcache td_dtrace->td_predcache +#define t_dtrace_ft td_dtrace->td_dtrace_ft +#define t_dtrace_on td_dtrace->td_dtrace_on +#define t_dtrace_step td_dtrace->td_dtrace_step +#define t_dtrace_ret td_dtrace->td_dtrace_ret +#define t_dtrace_ast td_dtrace->td_dtrace_ast +#define t_dtrace_reg td_dtrace->td_dtrace_reg +#define t_dtrace_pc td_dtrace->td_dtrace_pc +#define t_dtrace_npc td_dtrace->td_dtrace_npc +#define t_dtrace_scrpc td_dtrace->td_dtrace_scrpc +#define t_dtrace_astpc td_dtrace->td_dtrace_astpc +#define t_dtrace_regv td_dtrace->td_dtrace_regv +#define t_dtrace_sscr td_dtrace->td_dtrace_sscr +#define t_dtrace_systrace_args td_dtrace->td_systrace_args +#define p_dtrace_helpers p_dtrace->p_dtrace_helpers +#define p_dtrace_count p_dtrace->p_dtrace_count +#define p_dtrace_probes p_dtrace->p_dtrace_probes +#define p_model p_dtrace->p_dtrace_model + +#define DATAMODEL_NATIVE 0 +#ifdef __amd64__ +#define DATAMODEL_LP64 0 +#define DATAMODEL_ILP32 1 +#else +#define DATAMODEL_LP64 1 +#define DATAMODEL_ILP32 0 +#endif /* - * Definitions for fields in struct proc which are named differntly in FreeBSD. + * Definitions for fields in struct proc which are named differently in FreeBSD. */ //#define p_cred p_ucred #define p_parent p_pptr /* - * Definitions for fields in struct thread which are named differntly in NetBSD. + * Definitions for fields in struct thread which are named differently in NetBSD. 
*/ #define t_procp l_proc #define t_tid l_lid Index: src/external/cddl/osnet/dev/dtrace/dtrace_clone.c =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/dtrace_clone.c diff -N src/external/cddl/osnet/dev/dtrace/dtrace_clone.c --- src/external/cddl/osnet/dev/dtrace/dtrace_clone.c 21 Feb 2010 01:46:33 -0000 1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,63 +0,0 @@ -/* $NetBSD: dtrace_clone.c,v 1.2 2010/02/21 01:46:33 darran Exp $ */ - -/*- - * Copyright (C) 2006 John Birrell . - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice(s), this list of conditions and the following disclaimer as - * the first lines of this file unmodified other than the possible - * addition of one or more copyright notices. - * 2. Redistributions in binary form must reproduce the above copyright - * notice(s), this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_clone.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $ - * - */ - -static void -dtrace_clone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) -{ - int u = -1; - size_t len; - - if (*dev != NULL) - return; - - len = strlen(name); - - if (len != 6 && len != 13) - return; - - if (bcmp(name,"dtrace",6) != 0) - return; - - if (len == 13 && bcmp(name,"dtrace/dtrace",13) != 0) - return; - - /* Clone the device to the new minor number. */ - if (clone_create(&dtrace_clones, &dtrace_cdevsw, &u, dev, 0) != 0) - /* Create the /dev/dtrace/dtraceNN entry. */ - *dev = make_dev_cred(&dtrace_cdevsw, u, cred, - UID_ROOT, GID_WHEEL, 0600, "dtrace/dtrace%d", u); - if (*dev != NULL) { - dev_ref(*dev); - (*dev)->si_flags |= SI_CHEAPCLONE; - } -} Index: src/external/cddl/osnet/dev/dtrace/dtrace_debug.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_debug.c,v retrieving revision 1.8 diff -u -p -r1.8 dtrace_debug.c --- src/external/cddl/osnet/dev/dtrace/dtrace_debug.c 5 Mar 2014 06:06:42 -0000 1.8 +++ src/external/cddl/osnet/dev/dtrace/dtrace_debug.c 10 May 2017 11:09:52 -0000 @@ -27,33 +27,35 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
* - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_debug.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_debug.c 315208 2017-03-13 18:43:00Z markj $ * */ static char const hex2ascii_data[] = "0123456789abcdefghijklmnopqrstuvwxyz"; #define hex2ascii(hex) (hex2ascii_data[hex]) +#define MAXCPU MAXCPUS #ifdef DEBUG #define DTRACE_DEBUG_BUFR_SIZE (32 * 1024) struct dtrace_debug_data { + u_long lock __aligned(CACHE_LINE_SIZE); char bufr[DTRACE_DEBUG_BUFR_SIZE]; char *first; char *last; char *next; -} dtrace_debug_data[MAXCPUS]; +} dtrace_debug_data[MAXCPU]; static char dtrace_debug_bufr[DTRACE_DEBUG_BUFR_SIZE]; -static volatile u_long dtrace_debug_flag[MAXCPUS]; - static void dtrace_debug_lock(int cpu) { - /* FIXME: use atomic_cmpset_ulong once we have it */ - while (atomic_cas_ulong(&dtrace_debug_flag[cpu], 0, 1) == 0) + void *tid; + + tid = curlwp; + while (atomic_cas_ptr(&dtrace_debug_data[cpu].lock, 0, tid) == 0) /* Loop until the lock is obtained. */ ; } @@ -61,7 +63,9 @@ dtrace_debug_lock(int cpu) static void dtrace_debug_unlock(int cpu) { - dtrace_debug_flag[cpu] = 0; + + membar_producer(); + dtrace_debug_data[cpu].lock = 0; } static void @@ -83,25 +87,26 @@ dtrace_debug_init(void *dummy) } } -//SYSINIT(dtrace_debug_init, SI_SUB_KDTRACE, SI_ORDER_ANY, dtrace_debug_init, NULL); -//SYSINIT(dtrace_debug_smpinit, SI_SUB_SMP, SI_ORDER_ANY, dtrace_debug_init, NULL); +#ifdef __FreeBSD__ +SYSINIT(dtrace_debug_init, SI_SUB_KDTRACE, SI_ORDER_ANY, dtrace_debug_init, NULL); +SYSINIT(dtrace_debug_smpinit, SI_SUB_SMP, SI_ORDER_ANY, dtrace_debug_init, NULL); +#endif static void dtrace_debug_output(void) { char *p; + int i; struct dtrace_debug_data *d; uintptr_t count; CPU_INFO_ITERATOR cpuind; struct cpu_info *cinfo; - cpuid_t cpuid; for (CPU_INFO_FOREACH(cpuind, cinfo)) { - cpuid = cpu_index(cinfo); + i = cpu_index(cinfo); + dtrace_debug_lock(i); - dtrace_debug_lock(cpuid); - - d = &dtrace_debug_data[cpuid]; + d = &dtrace_debug_data[i]; count = 0; @@ -129,7 +134,7 @@ dtrace_debug_output(void) d->first = d->bufr; d->next = d->bufr; - dtrace_debug_unlock(cpuid); + dtrace_debug_unlock(i); if (count > 0) { char *last = dtrace_debug_bufr + count; @@ -158,10 +163,11 @@ dtrace_debug_output(void) */ static __inline void -dtrace_debug__putc(char c) +dtrace_debug__putc(int cpu, char c) { - struct dtrace_debug_data *d = &dtrace_debug_data[cpu_number()]; + struct dtrace_debug_data *d; + d = &dtrace_debug_data[cpu]; *d->next++ = c; if (d->next == d->last) @@ -179,24 +185,30 @@ dtrace_debug__putc(char c) static void __used dtrace_debug_putc(char c) { - dtrace_debug_lock(cpu_number()); + int cpu; + + cpu = cpu_number(); + dtrace_debug_lock(cpu); - dtrace_debug__putc(c); + dtrace_debug__putc(cpu, c); - dtrace_debug_unlock(cpu_number()); + dtrace_debug_unlock(cpu); } static void __used dtrace_debug_puts(const char *s) { - dtrace_debug_lock(cpu_number()); + int cpu; + + cpu = cpu_number(); + dtrace_debug_lock(cpu); while (*s != '\0') - dtrace_debug__putc(*s++); + dtrace_debug__putc(cpu, *s++); - dtrace_debug__putc('\0'); + dtrace_debug__putc(cpu, '\0'); - dtrace_debug_unlock(cpu_number()); + dtrace_debug_unlock(cpu); } /* @@ -205,30 +217,30 @@ dtrace_debug_puts(const char *s) * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse * order; return an optional length and a pointer to the last character * written in the buffer (i.e., the first character of the string). - * The buffer pointed to by `xbuf' must have length >= MAXNBUF. 
+ * The buffer pointed to by `nbuf' must have length >= MAXNBUF. */ static char * -dtrace_debug_ksprintn(char *xbuf, uintmax_t num, int base, int *lenp, int upper) +dtrace_debug_ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper) { char *p, c; - p = xbuf; + p = nbuf; *p = '\0'; do { c = hex2ascii(num % base); *++p = upper ? toupper(c) : c; } while (num /= base); if (lenp) - *lenp = p - xbuf; + *lenp = p - nbuf; return (p); } #define MAXNBUF (sizeof(intmax_t) * NBBY + 1) static void -dtrace_debug_vprintf(const char *fmt, va_list ap) +dtrace_debug_vprintf(int cpu, const char *fmt, va_list ap) { - char xbuf[MAXNBUF]; + char nbuf[MAXNBUF]; const char *p, *percent, *q; u_char *up; int ch, n; @@ -250,10 +262,10 @@ dtrace_debug_vprintf(const char *fmt, va width = 0; while ((ch = (u_char)*fmt++) != '%' || stop) { if (ch == '\0') { - dtrace_debug__putc('\0'); + dtrace_debug__putc(cpu, '\0'); return; } - dtrace_debug__putc(ch); + dtrace_debug__putc(cpu, ch); } percent = fmt - 1; qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; @@ -273,7 +285,7 @@ reswitch: switch (ch = (u_char)*fmt++) { ladjust = 1; goto reswitch; case '%': - dtrace_debug__putc(ch); + dtrace_debug__putc(cpu, ch); break; case '*': if (!dot) { @@ -307,8 +319,8 @@ reswitch: switch (ch = (u_char)*fmt++) { case 'b': num = (u_int)va_arg(ap, int); p = va_arg(ap, char *); - for (q = dtrace_debug_ksprintn(xbuf, num, *p++, NULL, 0); *q;) - dtrace_debug__putc(*q--); + for (q = dtrace_debug_ksprintn(nbuf, num, *p++, NULL, 0); *q;) + dtrace_debug__putc(cpu, *q--); if (num == 0) break; @@ -316,19 +328,19 @@ reswitch: switch (ch = (u_char)*fmt++) { for (tmp = 0; *p;) { n = *p++; if (num & (1 << (n - 1))) { - dtrace_debug__putc(tmp ? ',' : '<'); + dtrace_debug__putc(cpu, tmp ? ',' : '<'); for (; (n = *p) > ' '; ++p) - dtrace_debug__putc(n); + dtrace_debug__putc(cpu, n); tmp = 1; } else for (; *p > ' '; ++p) continue; } if (tmp) - dtrace_debug__putc('>'); + dtrace_debug__putc(cpu, '>'); break; case 'c': - dtrace_debug__putc(va_arg(ap, int)); + dtrace_debug__putc(cpu, va_arg(ap, int)); break; case 'D': up = va_arg(ap, u_char *); @@ -336,12 +348,12 @@ reswitch: switch (ch = (u_char)*fmt++) { if (!width) width = 16; while(width--) { - dtrace_debug__putc(hex2ascii(*up >> 4)); - dtrace_debug__putc(hex2ascii(*up & 0x0f)); + dtrace_debug__putc(cpu, hex2ascii(*up >> 4)); + dtrace_debug__putc(cpu, hex2ascii(*up & 0x0f)); up++; if (width) for (q=p;*q;q++) - dtrace_debug__putc(*q); + dtrace_debug__putc(cpu, *q); } break; case 'd': @@ -413,12 +425,12 @@ reswitch: switch (ch = (u_char)*fmt++) { if (!ladjust && width > 0) while (width--) - dtrace_debug__putc(padc); + dtrace_debug__putc(cpu, padc); while (n--) - dtrace_debug__putc(*p++); + dtrace_debug__putc(cpu, *p++); if (ladjust && width > 0) while (width--) - dtrace_debug__putc(padc); + dtrace_debug__putc(cpu, padc); break; case 't': tflag = 1; @@ -479,7 +491,7 @@ number: neg = 1; num = -(intmax_t)num; } - p = dtrace_debug_ksprintn(xbuf, num, base, &tmp, upper); + p = dtrace_debug_ksprintn(nbuf, num, base, &tmp, upper); if (sharpflag && num != 0) { if (base == 8) tmp++; @@ -492,32 +504,32 @@ number: if (!ladjust && padc != '0' && width && (width -= tmp) > 0) while (width--) - dtrace_debug__putc(padc); + dtrace_debug__putc(cpu, padc); if (neg) - dtrace_debug__putc('-'); + dtrace_debug__putc(cpu, '-'); if (sharpflag && num != 0) { if (base == 8) { - dtrace_debug__putc('0'); + dtrace_debug__putc(cpu, '0'); } else if (base == 16) { - dtrace_debug__putc('0'); - dtrace_debug__putc('x'); + 
dtrace_debug__putc(cpu, '0'); + dtrace_debug__putc(cpu, 'x'); } } if (!ladjust && width && (width -= tmp) > 0) while (width--) - dtrace_debug__putc(padc); + dtrace_debug__putc(cpu, padc); while (*p) - dtrace_debug__putc(*p--); + dtrace_debug__putc(cpu, *p--); if (ladjust && width && (width -= tmp) > 0) while (width--) - dtrace_debug__putc(padc); + dtrace_debug__putc(cpu, padc); break; default: while (percent < fmt) - dtrace_debug__putc(*percent++); + dtrace_debug__putc(cpu, *percent++); /* * Since we ignore an formatting argument it is no * longer safe to obey the remaining formatting @@ -529,23 +541,25 @@ number: } } - dtrace_debug__putc('\0'); + dtrace_debug__putc(cpu, '\0'); } void dtrace_debug_printf(const char *fmt, ...) { va_list ap; + int cpu; - dtrace_debug_lock(cpu_number()); + cpu = cpu_number(); + dtrace_debug_lock(cpu); va_start(ap, fmt); - dtrace_debug_vprintf(fmt, ap); + dtrace_debug_vprintf(cpu, fmt, ap); va_end(ap); - dtrace_debug_unlock(cpu_number()); + dtrace_debug_unlock(cpu); } #else @@ -554,4 +568,9 @@ dtrace_debug_printf(const char *fmt, ... #define dtrace_debug_puts(_s) #define dtrace_debug_printf(fmt, ...) +static void +dtrace_debug_init(void *dummy) +{ +} + #endif Index: src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c,v retrieving revision 1.5 diff -u -p -r1.5 dtrace_hacks.c --- src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c 23 Jun 2016 06:44:52 -0000 1.5 +++ src/external/cddl/osnet/dev/dtrace/dtrace_hacks.c 5 May 2017 11:52:00 -0000 @@ -1,12 +1,13 @@ /* $NetBSD: dtrace_hacks.c,v 1.5 2016/06/23 06:44:52 pgoyette Exp $ */ -/* $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_hacks.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ */ +/* $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_hacks.c 281916 2015-04-24 03:19:30Z markj $ */ /* XXX Hacks.... */ dtrace_cacheid_t dtrace_predcache_id; boolean_t -priv_policy_only(const cred_t *a, int b, boolean_t c) +priv_policy_only(const cred_t *cr, int b, boolean_t c) { - return 1; + + return kauth_authorize_generic(cr, KAUTH_GENERIC_ISSUSER, NULL) == 0; } Index: src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c,v retrieving revision 1.6 diff -u -p -r1.6 dtrace_ioctl.c --- src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c 30 Sep 2015 20:59:13 -0000 1.6 +++ src/external/cddl/osnet/dev/dtrace/dtrace_ioctl.c 11 Jun 2017 11:47:33 -0000 @@ -20,18 +20,106 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_ioctl.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_ioctl.c 313262 2017-02-05 02:39:12Z markj $ * */ -static int dtrace_verbose_ioctl=0; -//SYSCTL_INT(_debug_dtrace, OID_AUTO, verbose_ioctl, CTLFLAG_RW, &dtrace_verbose_ioctl, 0, ""); +static int dtrace_verbose_ioctl; +SYSCTL_INT(_debug_dtrace, OID_AUTO, verbose_ioctl, CTLFLAG_RW, + &dtrace_verbose_ioctl, 0, "log DTrace ioctls"); + +#define pfind(pid) proc_find((pid)) #define DTRACE_IOCTL_PRINTF(fmt, ...) 
if (dtrace_verbose_ioctl) printf(fmt, ## __VA_ARGS__ ) +#ifdef __FreeBSD__ +static int +dtrace_ioctl_helper(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td) +#endif +#ifdef __NetBSD__ +static int +dtrace_ioctl_helper(dev_t dev, u_long cmd, caddr_t addr, int flags) +#endif +{ + struct proc *p; + dof_helper_t *dhp; + dof_hdr_t *dof; + int rval; + + dhp = NULL; + dof = NULL; + rval = 0; + switch (cmd) { + case DTRACEHIOC_ADDDOF: + dhp = (dof_helper_t *)addr; + addr = (caddr_t)(uintptr_t)dhp->dofhp_dof; + p = curproc; + if (p->p_pid == dhp->dofhp_pid) { + dof = dtrace_dof_copyin((uintptr_t)addr, &rval); + } else { +#ifdef __FreeBSD__ + p = pfind(dhp->dofhp_pid); + if (p == NULL) + return (EINVAL); + if (!P_SHOULDSTOP(p) || + (p->p_flag & (P_TRACED | P_WEXIT)) != P_TRACED || + p->p_pptr != curproc) { + PROC_UNLOCK(p); + return (EINVAL); + } + _PHOLD(p); + PROC_UNLOCK(p); + dof = dtrace_dof_copyin_proc(p, (uintptr_t)addr, &rval); +#endif +#ifdef __NetBSD__ + dof = dtrace_dof_copyin_pid(dhp->dofhp_pid, addr, &rval); +#endif + } + + if (dof == NULL) { +#ifdef __FreeBSD__ + if (p != curproc) + PRELE(p); +#endif + break; + } + + mutex_enter(&dtrace_lock); + if ((rval = dtrace_helper_slurp(dof, dhp, p)) != -1) { + dhp->dofhp_gen = rval; + rval = 0; + } else { + rval = EINVAL; + } + mutex_exit(&dtrace_lock); +#ifdef __FreeBSD__ + if (p != curproc) + PRELE(p); +#endif + break; + case DTRACEHIOC_REMOVE: + mutex_enter(&dtrace_lock); + rval = dtrace_helper_destroygen(NULL, *(int *)(uintptr_t)addr); + mutex_exit(&dtrace_lock); + break; + default: + rval = ENOTTY; + break; + } + return (rval); +} + /* ARGSUSED */ +#ifdef __FreeBSD__ +static int +dtrace_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, + int flags __unused, struct thread *td) +#endif +#ifdef __NetBSD__ static int dtrace_ioctl(struct file *fp, u_long cmd, void *addr) +#endif { dtrace_state_t *state = (dtrace_state_t *)fp->f_data; int error = 0; @@ -224,6 +312,7 @@ dtrace_ioctl(struct file *fp, u_long cmd desc.dtbd_drops = buf->dtb_drops; desc.dtbd_errors = buf->dtb_errors; desc.dtbd_oldest = buf->dtb_xamot_offset; + desc.dtbd_timestamp = dtrace_gethrtime(); mutex_exit(&dtrace_lock); @@ -278,6 +367,7 @@ dtrace_ioctl(struct file *fp, u_long cmd desc.dtbd_drops = buf->dtb_xamot_drops; desc.dtbd_errors = buf->dtb_xamot_errors; desc.dtbd_oldest = 0; + desc.dtbd_timestamp = buf->dtb_switched; mutex_exit(&dtrace_lock); @@ -361,7 +451,8 @@ dtrace_ioctl(struct file *fp, u_long cmd return (EBUSY); } - if (dtrace_dof_slurp(dof, vstate, curlwp->l_cred, &enab, 0, B_TRUE) != 0) { + if (dtrace_dof_slurp(dof, vstate, CRED(), &enab, 0, 0, + B_TRUE) != 0) { mutex_exit(&dtrace_lock); mutex_exit(&cpu_lock); dtrace_dof_destroy(dof); @@ -528,19 +619,25 @@ dtrace_ioctl(struct file *fp, u_long cmd return (EINVAL); mutex_enter(&dtrace_provider_lock); +#ifdef illumos mutex_enter(&mod_lock); +#endif mutex_enter(&dtrace_lock); if (desc->dtargd_id > dtrace_nprobes) { mutex_exit(&dtrace_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); return (EINVAL); } if ((probe = dtrace_probes[desc->dtargd_id - 1]) == NULL) { mutex_exit(&dtrace_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); return (EINVAL); } @@ -564,7 +661,9 @@ dtrace_ioctl(struct file *fp, u_long cmd probe->dtpr_id, probe->dtpr_arg, desc); } +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); return (0); @@ -710,7 +809,7 @@ again: case DTRACEIOC_STATUS: { dtrace_status_t *stat = 
(dtrace_status_t *) addr; dtrace_dstate_t *dstate; - int j; + int i, j; uint64_t nerrs; CPU_INFO_ITERATOR cpuind; struct cpu_info *cinfo; @@ -742,24 +841,25 @@ again: dstate = &state->dts_vstate.dtvs_dynvars; for (CPU_INFO_FOREACH(cpuind, cinfo)) { - int ci = cpu_index(cinfo); - dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[ci]; + i = cpu_index(cinfo); + + dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i]; stat->dtst_dyndrops += dcpu->dtdsc_drops; stat->dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops; stat->dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops; - if (state->dts_buffer[ci].dtb_flags & DTRACEBUF_FULL) + if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL) stat->dtst_filled++; - nerrs += state->dts_buffer[ci].dtb_errors; + nerrs += state->dts_buffer[i].dtb_errors; for (j = 0; j < state->dts_nspeculations; j++) { dtrace_speculation_t *spec; dtrace_buffer_t *buf; spec = &state->dts_speculations[j]; - buf = &spec->dtsp_buffer[ci]; + buf = &spec->dtsp_buffer[i]; stat->dtst_specdrops += buf->dtb_xamot_drops; } } @@ -777,15 +877,16 @@ again: return (0); } case DTRACEIOC_STOP: { + int rval; processorid_t *cpuid = (processorid_t *) addr; DTRACE_IOCTL_PRINTF("%s(%d): DTRACEIOC_STOP\n",__func__,__LINE__); mutex_enter(&dtrace_lock); - error = dtrace_state_stop(state, cpuid); + rval = dtrace_state_stop(state, cpuid); mutex_exit(&dtrace_lock); - return (error); + return (rval); } default: error = ENOTTY; Index: src/external/cddl/osnet/dev/dtrace/dtrace_load.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_load.c,v retrieving revision 1.3 diff -u -p -r1.3 dtrace_load.c --- src/external/cddl/osnet/dev/dtrace/dtrace_load.c 31 Aug 2011 21:57:16 -0000 1.3 +++ src/external/cddl/osnet/dev/dtrace/dtrace_load.c 15 May 2017 23:58:54 -0000 @@ -20,7 +20,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_load.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_load.c 309069 2016-11-23 22:50:20Z gnn $ * */ @@ -30,8 +30,32 @@ void dtrace_gethrtime_init(void *); int dtrace_helptrace_size=0; -#ifndef mutex_init -#define mutex_init(a, b, c, d) mutex_init(a, c, IPL_NONE) +#ifdef __FreeBSD__ +#ifndef EARLY_AP_STARTUP +static void +dtrace_ap_start(void *dummy) +{ + int i; + + mutex_enter(&cpu_lock); + + /* Setup the rest of the CPUs. */ + CPU_FOREACH(i) { + if (i == 0) + continue; + + (void) dtrace_cpu_setup(CPU_CONFIG, i); + } + + mutex_exit(&cpu_lock); +} + +SYSINIT(dtrace_ap_start, SI_SUB_SMP, SI_ORDER_ANY, dtrace_ap_start, NULL); +#endif +#endif + +#ifdef __NetBSD__ +void *dtrace_modcb; #endif static void @@ -44,6 +68,17 @@ dtrace_load(void *dummy) dtrace_debug_init(NULL); dtrace_gethrtime_init(NULL); +#ifdef __FreeBSD__ + /* + * DTrace uses negative logic for the destructive mode switch, so it + * is required to translate from the sysctl which uses positive logic. + */ + if (dtrace_allow_destructive) + dtrace_destructive_disallow = 0; + else + dtrace_destructive_disallow = 1; +#endif + /* Hook into the trap handler. */ dtrace_trap_func = dtrace_trap; @@ -53,11 +88,23 @@ dtrace_load(void *dummy) /* Hang our hook for exceptions. */ dtrace_invop_init(); - /* - * XXX This is a short term hack to avoid having to comment - * out lots and lots of lock/unlock calls. 
- */ - mutex_init(&mod_lock,"XXX mod_lock hack", MUTEX_DEFAULT, NULL); +#ifdef __FreeBSD__ + dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri, 0, 0, 0); + + dtrace_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx); + + /* Register callbacks for linker file load and unload events. */ + dtrace_kld_load_tag = EVENTHANDLER_REGISTER(kld_load, + dtrace_kld_load, NULL, EVENTHANDLER_PRI_ANY); + dtrace_kld_unload_try_tag = EVENTHANDLER_REGISTER(kld_unload_try, + dtrace_kld_unload_try, NULL, EVENTHANDLER_PRI_ANY); +#endif + +#ifdef __NetBSD__ + dtrace_arena = vmem_create("dtrace", 1, INT_MAX, 1, + NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE); + +#endif /* * Initialise the mutexes without 'witness' because the dtrace @@ -70,7 +117,9 @@ dtrace_load(void *dummy) mutex_init(&dtrace_lock,"dtrace probe state", MUTEX_DEFAULT, NULL); mutex_init(&dtrace_provider_lock,"dtrace provider state", MUTEX_DEFAULT, NULL); mutex_init(&dtrace_meta_lock,"dtrace meta-provider state", MUTEX_DEFAULT, NULL); +#ifdef DEBUG mutex_init(&dtrace_errlock,"dtrace error lock", MUTEX_DEFAULT, NULL); +#endif mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -78,9 +127,6 @@ dtrace_load(void *dummy) ASSERT(MUTEX_HELD(&cpu_lock)); - dtrace_arena = vmem_create("dtrace", 1, INT_MAX, 1, - NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE); - dtrace_state_cache = kmem_cache_create(__UNCONST("dtrace_state_cache"), sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); @@ -130,19 +176,6 @@ dtrace_load(void *dummy) dtrace_provider, NULL, NULL, "ERROR", 1, NULL); mutex_exit(&cpu_lock); - - /* - * If DTrace helper tracing is enabled, we need to allocate the - * trace buffer and initialize the values. - */ - if (dtrace_helptrace_enabled) { - ASSERT(dtrace_helptrace_buffer == NULL); - dtrace_helptrace_buffer = - kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP); - dtrace_helptrace_next = 0; - dtrace_helptrace_size = dtrace_helptrace_bufsize; - } - mutex_exit(&dtrace_lock); mutex_exit(&dtrace_provider_lock); @@ -155,9 +188,17 @@ dtrace_load(void *dummy) mutex_exit(&cpu_lock); +#ifdef __NetBSD__ dtrace_anon_init(NULL); -#if 0 - dtrace_dev = make_dev(&dtrace_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "dtrace/dtrace"); + + dtrace_modcb = module_register_callbacks(dtrace_module_loaded, + dtrace_module_unloaded); +#endif +#ifdef __FreeBSD__ + dtrace_dev = make_dev(&dtrace_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "dtrace/dtrace"); + helper_dev = make_dev(&helper_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, + "dtrace/helper"); #endif return; Index: src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c,v retrieving revision 1.5 diff -u -p -r1.5 dtrace_modevent.c --- src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c 28 Nov 2015 22:41:36 -0000 1.5 +++ src/external/cddl/osnet/dev/dtrace/dtrace_modevent.c 12 Apr 2017 15:44:25 -0000 @@ -20,7 +20,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_modevent.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_load.c 309069 2016-11-23 22:50:20Z gnn $ * */ Index: src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c,v retrieving revision 1.3 diff -u -p -r1.3 dtrace_sysctl.c --- src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c 
23 Apr 2010 11:39:52 -0000 1.3 +++ src/external/cddl/osnet/dev/dtrace/dtrace_sysctl.c 12 Apr 2017 15:49:42 -0000 @@ -20,16 +20,10 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_sysctl.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_sysctl.c 309069 2016-11-23 22:50:20Z gnn $ * */ -int dtrace_debug = 0; -#if 0 -TUNABLE_INT("debug.dtrace.debug", &dtrace_debug); -SYSCTL_INT(_debug_dtrace, OID_AUTO, debug, CTLFLAG_RW, &dtrace_debug, 0, ""); -#endif - #if 0 /* XXX TBD sysctl */ /* Report registered DTrace providers. */ static int @@ -82,6 +76,27 @@ sysctl_dtrace_providers(SYSCTL_HANDLER_A return (error); } +SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace debug parameters"); + SYSCTL_PROC(_debug_dtrace, OID_AUTO, providers, CTLTYPE_STRING | CTLFLAG_RD, - 0, 0, sysctl_dtrace_providers, "A", ""); + 0, 0, sysctl_dtrace_providers, "A", "available DTrace providers"); + +SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace parameters"); + +SYSCTL_INT(_kern_dtrace, OID_AUTO, err_verbose, CTLFLAG_RW, + &dtrace_err_verbose, 0, + "print DIF and DOF validation errors to the message buffer"); + +SYSCTL_INT(_kern_dtrace, OID_AUTO, memstr_max, CTLFLAG_RW, &dtrace_memstr_max, + 0, "largest allowed argument to memstr(), 0 indicates no limit"); + +SYSCTL_QUAD(_kern_dtrace, OID_AUTO, dof_maxsize, CTLFLAG_RW, + &dtrace_dof_maxsize, 0, "largest allowed DOF table"); + +SYSCTL_QUAD(_kern_dtrace, OID_AUTO, helper_actions_max, CTLFLAG_RW, + &dtrace_helper_actions_max, 0, "maximum number of allowed helper actions"); + +SYSCTL_INT(_security_bsd, OID_AUTO, allow_destructive_dtrace, CTLFLAG_RDTUN, + &dtrace_allow_destructive, 1, "Allow destructive mode DTrace scripts"); + #endif Index: src/external/cddl/osnet/dev/dtrace/dtrace_test.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_test.c,v retrieving revision 1.2 diff -u -p -r1.2 dtrace_test.c --- src/external/cddl/osnet/dev/dtrace/dtrace_test.c 21 Feb 2010 01:46:33 -0000 1.2 +++ src/external/cddl/osnet/dev/dtrace/dtrace_test.c 12 Apr 2017 15:53:25 -0000 @@ -24,18 +24,26 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_test.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_test.c 258622 2013-11-26 08:46:27Z avg $ * */ - #include #include #include +#include + #include #include #include +#include +#include #include +SDT_PROVIDER_DEFINE(test); + +SDT_PROBE_DEFINE7(test, , , sdttest, "int", "int", "int", "int", "int", + "int", "int"); + /* * These are variables that the DTrace test suite references in the * Solaris kernel. We define them here so that the tests function @@ -47,6 +55,33 @@ typedef struct vnode vnode_t; vnode_t dummy; vnode_t *rootvp = &dummy; +/* + * Test SDT probes with more than 5 arguments. On amd64, such probes require + * special handling since only the first 5 arguments will be passed to + * dtrace_probe() in registers; the rest must be fetched off the stack. 
+ */ +static int +dtrace_test_sdttest(SYSCTL_HANDLER_ARGS) +{ + int val, error; + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || req->newptr == NULL) + return (error); + else if (val == 0) + return (0); + + SDT_PROBE7(test, , , sdttest, 1, 2, 3, 4, 5, 6, 7); + + return (error); +} + +static SYSCTL_NODE(_debug, OID_AUTO, dtracetest, CTLFLAG_RD, 0, ""); + +SYSCTL_PROC(_debug_dtracetest, OID_AUTO, sdttest, CTLTYPE_INT | CTLFLAG_RW, + NULL, 0, dtrace_test_sdttest, "I", "Trigger the SDT test probe"); + static int dtrace_test_modevent(module_t mod, int type, void *data) { Index: src/external/cddl/osnet/dev/dtrace/dtrace_unload.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_unload.c,v retrieving revision 1.6 diff -u -p -r1.6 dtrace_unload.c --- src/external/cddl/osnet/dev/dtrace/dtrace_unload.c 26 Feb 2015 09:10:52 -0000 1.6 +++ src/external/cddl/osnet/dev/dtrace/dtrace_unload.c 15 May 2017 23:59:03 -0000 @@ -20,19 +20,25 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_unload.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_unload.c 278166 2015-02-03 19:39:53Z pfg $ * */ -extern int dtrace_probes_size; -extern int dtrace_helptrace_size; - static int dtrace_unload() { dtrace_state_t *state; int error = 0; +#ifdef __FreeBSD__ + destroy_dev(dtrace_dev); + destroy_dev(helper_dev); +#endif + +#ifdef __NetBSD__ + module_unregister_callbacks(dtrace_modcb); +#endif + mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); mutex_enter(&cpu_lock); @@ -52,6 +58,10 @@ dtrace_unload() } dtrace_provider = NULL; +#ifdef __FreeBSD__ + EVENTHANDLER_DEREGISTER(kld_load, dtrace_kld_load_tag); + EVENTHANDLER_DEREGISTER(kld_unload_try, dtrace_kld_unload_try_tag); +#endif if ((state = dtrace_anon_grab()) != NULL) { /* @@ -67,13 +77,8 @@ dtrace_unload() mutex_exit(&cpu_lock); - if (dtrace_helptrace_enabled) { - kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_size); - dtrace_helptrace_buffer = NULL; - } - if (dtrace_probes != NULL) { - kmem_free(dtrace_probes, dtrace_probes_size); + kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *)); dtrace_probes = NULL; dtrace_nprobes = 0; } @@ -87,7 +92,12 @@ dtrace_unload() kmem_cache_destroy(dtrace_state_cache); +#ifdef __FreeBSD__ + delete_unrhdr(dtrace_arena); +#endif +#ifdef __NetBSD__ vmem_destroy(dtrace_arena); +#endif if (dtrace_toxrange != NULL) { kmem_free(dtrace_toxrange, @@ -107,10 +117,13 @@ dtrace_unload() mutex_destroy(&dtrace_meta_lock); mutex_destroy(&dtrace_provider_lock); mutex_destroy(&dtrace_lock); +#ifdef DEBUG mutex_destroy(&dtrace_errlock); +#endif - /* XXX Hack */ - mutex_destroy(&mod_lock); +#ifdef __FreeBSD__ + taskq_destroy(dtrace_taskq); +#endif /* Reset our hook for exceptions. 
*/ dtrace_invop_uninit(); Index: src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c,v retrieving revision 1.2 diff -u -p -r1.2 dtrace_vtime.c --- src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c 21 Feb 2010 01:46:33 -0000 1.2 +++ src/external/cddl/osnet/dev/dtrace/dtrace_vtime.c 12 Apr 2017 16:01:35 -0000 @@ -20,7 +20,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/dtrace_vtime.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/dtrace_vtime.c 179237 2008-05-23 05:59:42Z jb $ */ /* Index: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c diff -N src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c --- src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.c 18 Jul 2011 00:42:40 -0000 1.3 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,3195 +0,0 @@ -/* $NetBSD: dis_tables.c,v 1.3 2011/07/18 00:42:40 christos Exp $ */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dis_tables.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - - -#if defined(sun) -#pragma ident "@(#)dis_tables.c 1.11 06/03/02 SMI" -#endif - -#include "dis_tables.h" - -/* BEGIN CSTYLED */ - -/* - * Disassembly begins in dis_distable, which is equivalent to the One-byte - * Opcode Map in the Intel IA32 ISA Reference (page A-6 in my copy). The - * decoding loops then traverse out through the other tables as necessary to - * decode a given instruction. - * - * The behavior of this file can be controlled by one of the following flags: - * - * DIS_TEXT Include text for disassembly - * DIS_MEM Include memory-size calculations - * - * Either or both of these can be defined. - * - * This file is not, and will never be, cstyled. If anything, the tables should - * be taken out another tab stop or two so nothing overlaps. - */ - -/* - * These functions must be provided for the consumer to do disassembly. - */ -#ifdef DIS_TEXT -extern char *strncpy(char *, const char *, size_t); -extern size_t strlen(const char *); -extern int strcmp(const char *, const char *); -extern int strncmp(const char *, const char *, size_t); -extern size_t strlcat(char *, const char *, size_t); -#endif - - -#define TERM NULL /* used to indicate that the 'indirect' */ - /* field terminates - no pointer. */ - -/* Used to decode instructions. 
*/ -typedef struct instable { - const struct instable *it_indirect; /* for decode op codes */ - uchar_t it_adrmode; -#ifdef DIS_TEXT - char it_name[NCPS]; - uint_t it_suffix:1; /* mneu + "w", "l", or "d" */ -#endif -#ifdef DIS_MEM - uint_t it_size:16; -#endif - uint_t it_invalid64:1; /* opcode invalid in amd64 */ - uint_t it_always64:1; /* 64 bit when in 64 bit mode */ - uint_t it_invalid32:1; /* invalid in IA32 */ - uint_t it_stackop:1; /* push/pop stack operation */ -} instable_t; - -/* - * Instruction formats. - */ -enum { - UNKNOWN, - MRw, - IMlw, - IMw, - IR, - OA, - AO, - MS, - SM, - Mv, - Mw, - M, /* register or memory */ - Mb, /* register or memory, always byte sized */ - MO, /* memory only (no registers) */ - PREF, - SWAPGS, - R, - RA, - SEG, - MR, - RM, - IA, - MA, - SD, - AD, - SA, - D, - INM, - SO, - BD, - I, - P, - V, - DSHIFT, /* for double shift that has an 8-bit immediate */ - U, - OVERRIDE, - NORM, /* instructions w/o ModR/M byte, no memory access */ - IMPLMEM, /* instructions w/o ModR/M byte, implicit mem access */ - O, /* for call */ - JTAB, /* jump table */ - IMUL, /* for 186 iimul instr */ - CBW, /* so data16 can be evaluated for cbw and variants */ - MvI, /* for 186 logicals */ - ENTER, /* for 186 enter instr */ - RMw, /* for 286 arpl instr */ - Ib, /* for push immediate byte */ - F, /* for 287 instructions */ - FF, /* for 287 instructions */ - FFC, /* for 287 instructions */ - DM, /* 16-bit data */ - AM, /* 16-bit addr */ - LSEG, /* for 3-bit seg reg encoding */ - MIb, /* for 386 logicals */ - SREG, /* for 386 special registers */ - PREFIX, /* a REP instruction prefix */ - LOCK, /* a LOCK instruction prefix */ - INT3, /* The int 3 instruction, which has a fake operand */ - INTx, /* The normal int instruction, with explicit int num */ - DSHIFTcl, /* for double shift that implicitly uses %cl */ - CWD, /* so data16 can be evaluated for cwd and variants */ - RET, /* single immediate 16-bit operand */ - MOVZ, /* for movs and movz, with different size operands */ - XADDB, /* for xaddb */ - MOVSXZ, /* AMD64 mov sign extend 32 to 64 bit instruction */ - -/* - * MMX/SIMD addressing modes. 
- */ - - MMO, /* Prefixable MMX/SIMD-Int mm/mem -> mm */ - MMOIMPL, /* Prefixable MMX/SIMD-Int mm -> mm (mem) */ - MMO3P, /* Prefixable MMX/SIMD-Int mm -> r32,imm8 */ - MMOM3, /* Prefixable MMX/SIMD-Int mm -> r32 */ - MMOS, /* Prefixable MMX/SIMD-Int mm -> mm/mem */ - MMOMS, /* Prefixable MMX/SIMD-Int mm -> mem */ - MMOPM, /* MMX/SIMD-Int mm/mem -> mm,imm8 */ - MMOPRM, /* Prefixable MMX/SIMD-Int r32/mem -> mm,imm8 */ - MMOSH, /* Prefixable MMX mm,imm8 */ - MM, /* MMX/SIMD-Int mm/mem -> mm */ - MMS, /* MMX/SIMD-Int mm -> mm/mem */ - MMSH, /* MMX mm,imm8 */ - XMMO, /* Prefixable SIMD xmm/mem -> xmm */ - XMMOS, /* Prefixable SIMD xmm -> xmm/mem */ - XMMOPM, /* Prefixable SIMD xmm/mem w/to xmm,imm8 */ - XMMOMX, /* Prefixable SIMD mm/mem -> xmm */ - XMMOX3, /* Prefixable SIMD xmm -> r32 */ - XMMOXMM, /* Prefixable SIMD xmm/mem -> mm */ - XMMOM, /* Prefixable SIMD xmm -> mem */ - XMMOMS, /* Prefixable SIMD mem -> xmm */ - XMM, /* SIMD xmm/mem -> xmm */ - XMMXIMPL, /* SIMD xmm -> xmm (mem) */ - XMM3P, /* SIMD xmm -> r32,imm8 */ - XMMP, /* SIMD xmm/mem w/to xmm,imm8 */ - XMMPRM, /* SIMD r32/mem -> xmm,imm8 */ - XMMS, /* SIMD xmm -> xmm/mem */ - XMMM, /* SIMD mem -> xmm */ - XMMMS, /* SIMD xmm -> mem */ - XMM3MX, /* SIMD r32/mem -> xmm */ - XMM3MXS, /* SIMD xmm -> r32/mem */ - XMMSH, /* SIMD xmm,imm8 */ - XMMXM3, /* SIMD xmm/mem -> r32 */ - XMMX3, /* SIMD xmm -> r32 */ - XMMXMM, /* SIMD xmm/mem -> mm */ - XMMMX, /* SIMD mm -> xmm */ - XMMXM, /* SIMD xmm -> mm */ - XMMFENCE, /* SIMD lfence or mfence */ - XMMSFNC /* SIMD sfence (none or mem) */ -}; - -#define FILL 0x90 /* Fill byte used for alignment (nop) */ - -/* -** Register numbers for the i386 -*/ -#define EAX_REGNO 0 -#define ECX_REGNO 1 -#define EDX_REGNO 2 -#define EBX_REGNO 3 -#define ESP_REGNO 4 -#define EBP_REGNO 5 -#define ESI_REGNO 6 -#define EDI_REGNO 7 - -/* - * modes for immediate values - */ -#define MODE_NONE 0 -#define MODE_IPREL 1 /* signed IP relative value */ -#define MODE_SIGNED 2 /* sign extended immediate */ -#define MODE_IMPLIED 3 /* constant value implied from opcode */ -#define MODE_OFFSET 4 /* offset part of an address */ - -/* - * The letters used in these macros are: - * IND - indirect to another to another table - * "T" - means to Terminate indirections (this is the final opcode) - * "S" - means "operand length suffix required" - * "NS" - means "no suffix" which is the operand length suffix of the opcode - * "Z" - means instruction size arg required - * "u" - means the opcode is invalid in IA32 but valid in amd64 - * "x" - means the opcode is invalid in amd64, but not IA32 - * "y" - means the operand size is always 64 bits in 64 bit mode - * "p" - means push/pop stack operation - */ - -#if defined(DIS_TEXT) && defined(DIS_MEM) -#define IND(table) {table, 0, "", 0, 0, 0, 0, 0, 0} -#define INDx(table) {table, 0, "", 0, 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, name, 0, 0, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, name, 0, 0, 0, 0, 1, 0} -#define TNSx(name, amode) {TERM, amode, name, 0, 0, 1, 0, 0, 0} -#define TNSy(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0, 1} -#define TNSZ(name, amode, sz) {TERM, amode, name, 0, sz, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, name, 0, sz, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, name, 1, 0, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, name, 1, 0, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, name, 1, 0, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, 
name, 1, 0, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, name, 1, sz, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, name, 1, sz, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, name, 1, sz, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, "", 0, 0, 0, 0, 0} -#elif defined(DIS_TEXT) -#define IND(table) {table, 0, "", 0, 0, 0, 0, 0} -#define INDx(table) {table, 0, "", 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, name, 0, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0} -#define TNSx(name, amode) {TERM, amode, name, 0, 1, 0, 0, 0} -#define TNSy(name, amode) {TERM, amode, name, 0, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, name, 0, 0, 1, 0, 1} -#define TNSZ(name, amode, sz) {TERM, amode, name, 0, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, name, 0, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, name, 1, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, name, 1, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, name, 1, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, name, 1, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, name, 1, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, name, 1, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, name, 1, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, "", 0, 0, 0, 0, 0} -#elif defined(DIS_MEM) -#define IND(table) {table, 0, 0, 0, 0, 0, 0} -#define INDx(table) {table, 0, 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, 0, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, 0, 0, 0, 1, 0} -#define TNSy(name, amode) {TERM, amode, 0, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, 0, 0, 1, 0, 1} -#define TNSx(name, amode) {TERM, amode, 0, 1, 0, 0, 0} -#define TNSZ(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, sz, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, 0, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, 0, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, 0, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, 0, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, sz, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, sz, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, 0, 0, 0, 0, 0} -#else -#define IND(table) {table[0], 0, 0, 0, 0, 0} -#define INDx(table) {table[0], 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, 0, 0, 1, 0} -#define TNSy(name, amode) {TERM, amode, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, 0, 1, 0, 1} -#define TNSx(name, amode) {TERM, amode, 1, 0, 0, 0} -#define TNSZ(name, amode, sz) {TERM, amode, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, 0, 0, 0, 0} -#endif - -#ifdef DIS_TEXT -/* - * this decodes the r_m field for mode's 0, 1, 2 in 16 bit mode - */ -const char *const dis_addr16[3][8] = { -"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "", - "(%bx)", -"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di", "(%bp)", - "(%bx)", -"(%bx,%si)", 
"(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "(%bp)", - "(%bx)", -}; - - -/* - * This decodes 32 bit addressing mode r_m field for modes 0, 1, 2 - */ -const char *const dis_addr32_mode0[16] = { - "(%eax)", "(%ecx)", "(%edx)", "(%ebx)", "", "", "(%esi)", "(%edi)", - "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "", "(%r14d)", "(%r15d)" -}; - -const char *const dis_addr32_mode12[16] = { - "(%eax)", "(%ecx)", "(%edx)", "(%ebx)", "", "(%ebp)", "(%esi)", "(%edi)", - "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "(%r13d)", "(%r14d)", "(%r15d)" -}; - -/* - * This decodes 64 bit addressing mode r_m field for modes 0, 1, 2 - */ -const char *const dis_addr64_mode0[16] = { - "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "", "(%rip)", "(%rsi)", "(%rdi)", - "(%r8)", "(%r9)", "(%r10)", "(%r11)", "(%r12)", "(%rip)", "(%r14)", "(%r15)" -}; -const char *const dis_addr64_mode12[16] = { - "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "", "(%rbp)", "(%rsi)", "(%rdi)", - "(%r8)", "(%r9)", "(%r10)", "(%r11)", "(%r12)", "(%r13)", "(%r14)", "(%r15)" -}; - -/* - * decode for scale from SIB byte - */ -const char *const dis_scale_factor[4] = { ")", ",2)", ",4)", ",8)" }; - -/* - * register decoding for normal references to registers (ie. not addressing) - */ -const char *const dis_REG8[16] = { - "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh", - "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" -}; - -const char *const dis_REG8_REX[16] = { - "%al", "%cl", "%dl", "%bl", "%spl", "%bpl", "%sil", "%dil", - "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" -}; - -const char *const dis_REG16[16] = { - "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di", - "%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w" -}; - -const char *const dis_REG32[16] = { - "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", - "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" -}; - -const char *const dis_REG64[16] = { - "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", - "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" -}; - -const char *const dis_DEBUGREG[16] = { - "%db0", "%db1", "%db2", "%db3", "%db4", "%db5", "%db6", "%db7", - "%db8", "%db9", "%db10", "%db11", "%db12", "%db13", "%db14", "%db15" -}; - -const char *const dis_CONTROLREG[16] = { - "%cr0", "%cr1", "%cr2", "%cr3", "%cr4", "%cr5?", "%cr6?", "%cr7?", - "%cr8", "%cr9?", "%cr10?", "%cr11?", "%cr12?", "%cr13?", "%cr14?", "%cr15?" 
-}; - -const char *const dis_TESTREG[16] = { - "%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7", - "%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7" -}; - -const char *const dis_MMREG[16] = { - "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", - "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" -}; - -const char *const dis_XMMREG[16] = { - "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" -}; - -const char *const dis_SEGREG[16] = { - "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "", - "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "" -}; - -/* - * SIMD predicate suffixes - */ -const char *const dis_PREDSUFFIX[8] = { - "eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord" -}; - - - -#endif /* DIS_TEXT */ - - - - -/* - * "decode table" for 64 bit mode MOVSXD instruction (opcode 0x63) - */ -const instable_t dis_opMOVSLD = TNS("movslq",MOVSXZ); - -/* - * "decode table" for pause and clflush instructions - */ -const instable_t dis_opPause = TNS("pause", NORM); - -/* - * Decode table for 0x0F00 opcodes - */ -const instable_t dis_op0F00[8] = { - -/* [0] */ TNS("sldt",M), TNS("str",M), TNSy("lldt",M), TNSy("ltr",M), -/* [4] */ TNSZ("verr",M,2), TNSZ("verw",M,2), INVALID, INVALID, -}; - - -/* - * Decode table for 0x0F01 opcodes - */ -const instable_t dis_op0F01[8] = { - -/* [0] */ TNSZ("sgdt",MO,6), TNSZ("sidt",MO,6), TNSZ("lgdt",MO,6), TNSZ("lidt",MO,6), -/* [4] */ TNSZ("smsw",M,2), INVALID, TNSZ("lmsw",M,2), TNS("invlpg",SWAPGS), -}; - -/* - * Decode table for 0x0F18 opcodes -- SIMD prefetch - */ -const instable_t dis_op0F18[8] = { - -/* [0] */ TNS("prefetchnta",PREF),TNS("prefetcht0",PREF), TNS("prefetcht1",PREF), TNS("prefetcht2",PREF), -/* [4] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Decode table for 0x0FAE opcodes -- SIMD state save/restore - */ -const instable_t dis_op0FAE[8] = { -/* [0] */ TNSZ("fxsave",M,512), TNSZ("fxrstor",M,512), TNS("ldmxcsr",M), TNS("stmxcsr",M), -/* [4] */ INVALID, TNS("lfence",XMMFENCE), TNS("mfence",XMMFENCE), TNS("sfence",XMMSFNC), -}; - -/* - * Decode table for 0x0FBA opcodes - */ - -const instable_t dis_op0FBA[8] = { - -/* [0] */ INVALID, INVALID, INVALID, INVALID, -/* [4] */ TS("bt",MIb), TS("bts",MIb), TS("btr",MIb), TS("btc",MIb), -}; - -/* - * Decode table for 0x0FC7 opcode - */ - -const instable_t dis_op0FC7[8] = { - -/* [0] */ INVALID, TNS("cmpxchg8b",M), INVALID, INVALID, -/* [4] */ INVALID, INVALID, INVALID, INVALID, -}; - - -/* - * Decode table for 0x0FC8 opcode -- 486 bswap instruction - * - *bit pattern: 0000 1111 1100 1reg - */ -const instable_t dis_op0FC8[4] = { -/* [0] */ TNS("bswap",R), INVALID, INVALID, INVALID, -}; - -/* - * Decode table for 0x0F71, 0x0F72, and 0x0F73 opcodes -- MMX instructions - */ -const instable_t dis_op0F7123[4][8] = { -{ -/* [70].0 */ INVALID, INVALID, INVALID, INVALID, -/* .4 */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [71].0 */ INVALID, INVALID, TNS("psrlw",MMOSH), INVALID, -/* .4 */ TNS("psraw",MMOSH), INVALID, TNS("psllw",MMOSH), INVALID, -}, { -/* [72].0 */ INVALID, INVALID, TNS("psrld",MMOSH), INVALID, -/* .4 */ TNS("psrad",MMOSH), INVALID, TNS("pslld",MMOSH), INVALID, -}, { -/* [73].0 */ INVALID, INVALID, TNS("psrlq",MMOSH), TNS("INVALID",MMOSH), -/* .4 */ INVALID, INVALID, TNS("psllq",MMOSH), TNS("INVALID",MMOSH), -} }; - -/* - * Decode table for SIMD extensions to above 0x0F71-0x0F73 opcodes. 
- */ -const instable_t dis_opSIMD7123[32] = { -/* [70].0 */ INVALID, INVALID, INVALID, INVALID, -/* .4 */ INVALID, INVALID, INVALID, INVALID, - -/* [71].0 */ INVALID, INVALID, TNS("psrlw",XMMSH), INVALID, -/* .4 */ TNS("psraw",XMMSH), INVALID, TNS("psllw",XMMSH), INVALID, - -/* [72].0 */ INVALID, INVALID, TNS("psrld",XMMSH), INVALID, -/* .4 */ TNS("psrad",XMMSH), INVALID, TNS("pslld",XMMSH), INVALID, - -/* [73].0 */ INVALID, INVALID, TNS("psrlq",XMMSH), TNS("psrldq",XMMSH), -/* .4 */ INVALID, INVALID, TNS("psllq",XMMSH), TNS("pslldq",XMMSH), -}; - -/* - * SIMD instructions have been wedged into the existing IA32 instruction - * set through the use of prefixes. That is, while 0xf0 0x58 may be - * addps, 0xf3 0xf0 0x58 (literally, repz addps) is a completely different - * instruction - addss. At present, three prefixes have been coopted in - * this manner - address size (0x66), repnz (0xf2) and repz (0xf3). The - * following tables are used to provide the prefixed instruction names. - * The arrays are sparse, but they're fast. - */ - -/* - * Decode table for SIMD instructions with the address size (0x66) prefix. - */ -const instable_t dis_opSIMDdata16[256] = { -/* [00] */ INVALID, INVALID, INVALID, INVALID, -/* [04] */ INVALID, INVALID, INVALID, INVALID, -/* [08] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [10] */ TNSZ("movupd",XMM,16), TNSZ("movupd",XMMS,16), TNSZ("movlpd",XMMM,8), TNSZ("movlpd",XMMMS,8), -/* [14] */ TNSZ("unpcklpd",XMM,16),TNSZ("unpckhpd",XMM,16),TNSZ("movhpd",XMMM,8), TNSZ("movhpd",XMMMS,8), -/* [18] */ INVALID, INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, - -/* [20] */ INVALID, INVALID, INVALID, INVALID, -/* [24] */ INVALID, INVALID, INVALID, INVALID, -/* [28] */ TNSZ("movapd",XMM,16), TNSZ("movapd",XMMS,16), TNSZ("cvtpi2pd",XMMOMX,8),TNSZ("movntpd",XMMOMS,16), -/* [2C] */ TNSZ("cvttpd2pi",XMMXMM,16),TNSZ("cvtpd2pi",XMMXMM,16),TNSZ("ucomisd",XMM,8),TNSZ("comisd",XMM,8), - -/* [30] */ INVALID, INVALID, INVALID, INVALID, -/* [34] */ INVALID, INVALID, INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, - -/* [40] */ INVALID, INVALID, INVALID, INVALID, -/* [44] */ INVALID, INVALID, INVALID, INVALID, -/* [48] */ INVALID, INVALID, INVALID, INVALID, -/* [4C] */ INVALID, INVALID, INVALID, INVALID, - -/* [50] */ TNS("movmskpd",XMMOX3), TNSZ("sqrtpd",XMM,16), INVALID, INVALID, -/* [54] */ TNSZ("andpd",XMM,16), TNSZ("andnpd",XMM,16), TNSZ("orpd",XMM,16), TNSZ("xorpd",XMM,16), -/* [58] */ TNSZ("addpd",XMM,16), TNSZ("mulpd",XMM,16), TNSZ("cvtpd2ps",XMM,16),TNSZ("cvtps2dq",XMM,16), -/* [5C] */ TNSZ("subpd",XMM,16), TNSZ("minpd",XMM,16), TNSZ("divpd",XMM,16), TNSZ("maxpd",XMM,16), - -/* [60] */ TNSZ("punpcklbw",XMM,16),TNSZ("punpcklwd",XMM,16),TNSZ("punpckldq",XMM,16),TNSZ("packsswb",XMM,16), -/* [64] */ TNSZ("pcmpgtb",XMM,16), TNSZ("pcmpgtw",XMM,16), TNSZ("pcmpgtd",XMM,16), TNSZ("packuswb",XMM,16), -/* [68] */ TNSZ("punpckhbw",XMM,16),TNSZ("punpckhwd",XMM,16),TNSZ("punpckhdq",XMM,16),TNSZ("packssdw",XMM,16), -/* [6C] */ TNSZ("punpcklqdq",XMM,16),TNSZ("punpckhqdq",XMM,16),TNSZ("movd",XMM3MX,4),TNSZ("movdqa",XMM,16), - -/* [70] */ TNSZ("pshufd",XMMP,16), INVALID, INVALID, INVALID, -/* [74] */ TNSZ("pcmpeqb",XMM,16), TNSZ("pcmpeqw",XMM,16), TNSZ("pcmpeqd",XMM,16), INVALID, -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, TNSZ("movd",XMM3MXS,4), TNSZ("movdqa",XMMS,16), - -/* [80] */ INVALID, INVALID, INVALID, INVALID, -/* 
[84] */ INVALID, INVALID, INVALID, INVALID, -/* [88] */ INVALID, INVALID, INVALID, INVALID, -/* [8C] */ INVALID, INVALID, INVALID, INVALID, - -/* [90] */ INVALID, INVALID, INVALID, INVALID, -/* [94] */ INVALID, INVALID, INVALID, INVALID, -/* [98] */ INVALID, INVALID, INVALID, INVALID, -/* [9C] */ INVALID, INVALID, INVALID, INVALID, - -/* [A0] */ INVALID, INVALID, INVALID, INVALID, -/* [A4] */ INVALID, INVALID, INVALID, INVALID, -/* [A8] */ INVALID, INVALID, INVALID, INVALID, -/* [AC] */ INVALID, INVALID, INVALID, INVALID, - -/* [B0] */ INVALID, INVALID, INVALID, INVALID, -/* [B4] */ INVALID, INVALID, INVALID, INVALID, -/* [B8] */ INVALID, INVALID, INVALID, INVALID, -/* [BC] */ INVALID, INVALID, INVALID, INVALID, - -/* [C0] */ INVALID, INVALID, TNSZ("cmppd",XMMP,16), INVALID, -/* [C4] */ TNSZ("pinsrw",XMMPRM,2),TNS("pextrw",XMM3P), TNSZ("shufpd",XMMP,16), INVALID, -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, - -/* [D0] */ INVALID, TNSZ("psrlw",XMM,16), TNSZ("psrld",XMM,16), TNSZ("psrlq",XMM,16), -/* [D4] */ TNSZ("paddq",XMM,16), TNSZ("pmullw",XMM,16), TNSZ("movq",XMMS,8), TNS("pmovmskb",XMMX3), -/* [D8] */ TNSZ("psubusb",XMM,16), TNSZ("psubusw",XMM,16), TNSZ("pminub",XMM,16), TNSZ("pand",XMM,16), -/* [DC] */ TNSZ("paddusb",XMM,16), TNSZ("paddusw",XMM,16), TNSZ("pmaxub",XMM,16), TNSZ("pandn",XMM,16), - -/* [E0] */ TNSZ("pavgb",XMM,16), TNSZ("psraw",XMM,16), TNSZ("psrad",XMM,16), TNSZ("pavgw",XMM,16), -/* [E4] */ TNSZ("pmulhuw",XMM,16), TNSZ("pmulhw",XMM,16), TNSZ("cvttpd2dq",XMM,16),TNSZ("movntdq",XMMS,16), -/* [E8] */ TNSZ("psubsb",XMM,16), TNSZ("psubsw",XMM,16), TNSZ("pminsw",XMM,16), TNSZ("por",XMM,16), -/* [EC] */ TNSZ("paddsb",XMM,16), TNSZ("paddsw",XMM,16), TNSZ("pmaxsw",XMM,16), TNSZ("pxor",XMM,16), - -/* [F0] */ INVALID, TNSZ("psllw",XMM,16), TNSZ("pslld",XMM,16), TNSZ("psllq",XMM,16), -/* [F4] */ TNSZ("pmuludq",XMM,16), TNSZ("pmaddwd",XMM,16), TNSZ("psadbw",XMM,16), TNSZ("maskmovdqu", XMMXIMPL,16), -/* [F8] */ TNSZ("psubb",XMM,16), TNSZ("psubw",XMM,16), TNSZ("psubd",XMM,16), TNSZ("psubq",XMM,16), -/* [FC] */ TNSZ("paddb",XMM,16), TNSZ("paddw",XMM,16), TNSZ("paddd",XMM,16), INVALID, -}; - -/* - * Decode table for SIMD instructions with the repnz (0xf2) prefix. 
- */ -const instable_t dis_opSIMDrepnz[256] = { -/* [00] */ INVALID, INVALID, INVALID, INVALID, -/* [04] */ INVALID, INVALID, INVALID, INVALID, -/* [08] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [10] */ TNSZ("movsd",XMM,8), TNSZ("movsd",XMMS,8), INVALID, INVALID, -/* [14] */ INVALID, INVALID, INVALID, INVALID, -/* [18] */ INVALID, INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, - -/* [20] */ INVALID, INVALID, INVALID, INVALID, -/* [24] */ INVALID, INVALID, INVALID, INVALID, -/* [28] */ INVALID, INVALID, TNSZ("cvtsi2sd",XMM3MX,4),INVALID, -/* [2C] */ TNSZ("cvttsd2si",XMMXM3,8),TNSZ("cvtsd2si",XMMXM3,8),INVALID, INVALID, - -/* [30] */ INVALID, INVALID, INVALID, INVALID, -/* [34] */ INVALID, INVALID, INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, - -/* [40] */ INVALID, INVALID, INVALID, INVALID, -/* [44] */ INVALID, INVALID, INVALID, INVALID, -/* [48] */ INVALID, INVALID, INVALID, INVALID, -/* [4C] */ INVALID, INVALID, INVALID, INVALID, - -/* [50] */ INVALID, TNSZ("sqrtsd",XMM,8), INVALID, INVALID, -/* [54] */ INVALID, INVALID, INVALID, INVALID, -/* [58] */ TNSZ("addsd",XMM,8), TNSZ("mulsd",XMM,8), TNSZ("cvtsd2ss",XMM,8), INVALID, -/* [5C] */ TNSZ("subsd",XMM,8), TNSZ("minsd",XMM,8), TNSZ("divsd",XMM,8), TNSZ("maxsd",XMM,8), - -/* [60] */ INVALID, INVALID, INVALID, INVALID, -/* [64] */ INVALID, INVALID, INVALID, INVALID, -/* [68] */ INVALID, INVALID, INVALID, INVALID, -/* [6C] */ INVALID, INVALID, INVALID, INVALID, - -/* [70] */ TNSZ("pshuflw",XMMP,16),INVALID, INVALID, INVALID, -/* [74] */ INVALID, INVALID, INVALID, INVALID, -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, INVALID, INVALID, - -/* [80] */ INVALID, INVALID, INVALID, INVALID, -/* [84] */ INVALID, INVALID, INVALID, INVALID, -/* [88] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [90] */ INVALID, INVALID, INVALID, INVALID, -/* [94] */ INVALID, INVALID, INVALID, INVALID, -/* [98] */ INVALID, INVALID, INVALID, INVALID, -/* [9C] */ INVALID, INVALID, INVALID, INVALID, - -/* [A0] */ INVALID, INVALID, INVALID, INVALID, -/* [A4] */ INVALID, INVALID, INVALID, INVALID, -/* [A8] */ INVALID, INVALID, INVALID, INVALID, -/* [AC] */ INVALID, INVALID, INVALID, INVALID, - -/* [B0] */ INVALID, INVALID, INVALID, INVALID, -/* [B4] */ INVALID, INVALID, INVALID, INVALID, -/* [B8] */ INVALID, INVALID, INVALID, INVALID, -/* [BC] */ INVALID, INVALID, INVALID, INVALID, - -/* [C0] */ INVALID, INVALID, TNSZ("cmpsd",XMMP,8), INVALID, -/* [C4] */ INVALID, INVALID, INVALID, INVALID, -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, - -/* [D0] */ INVALID, INVALID, INVALID, INVALID, -/* [D4] */ INVALID, INVALID, TNS("movdq2q",XMMXM), INVALID, -/* [D8] */ INVALID, INVALID, INVALID, INVALID, -/* [DC] */ INVALID, INVALID, INVALID, INVALID, - -/* [E0] */ INVALID, INVALID, INVALID, INVALID, -/* [E4] */ INVALID, INVALID, TNSZ("cvtpd2dq",XMM,16),INVALID, -/* [E8] */ INVALID, INVALID, INVALID, INVALID, -/* [EC] */ INVALID, INVALID, INVALID, INVALID, - -/* [F0] */ INVALID, INVALID, INVALID, INVALID, -/* [F4] */ INVALID, INVALID, INVALID, INVALID, -/* [F8] */ INVALID, INVALID, INVALID, INVALID, -/* [FC] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Decode table for SIMD instructions with the repz (0xf3) prefix. 
- */ -const instable_t dis_opSIMDrepz[256] = { -/* [00] */ INVALID, INVALID, INVALID, INVALID, -/* [04] */ INVALID, INVALID, INVALID, INVALID, -/* [08] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [10] */ TNSZ("movss",XMM,4), TNSZ("movss",XMMS,4), INVALID, INVALID, -/* [14] */ INVALID, INVALID, INVALID, INVALID, -/* [18] */ INVALID, INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, - -/* [20] */ INVALID, INVALID, INVALID, INVALID, -/* [24] */ INVALID, INVALID, INVALID, INVALID, -/* [28] */ INVALID, INVALID, TNSZ("cvtsi2ss",XMM3MX,4),INVALID, -/* [2C] */ TNSZ("cvttss2si",XMMXM3,4),TNSZ("cvtss2si",XMMXM3,4),INVALID, INVALID, - -/* [30] */ INVALID, INVALID, INVALID, INVALID, -/* [34] */ INVALID, INVALID, INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, - -/* [40] */ INVALID, INVALID, INVALID, INVALID, -/* [44] */ INVALID, INVALID, INVALID, INVALID, -/* [48] */ INVALID, INVALID, INVALID, INVALID, -/* [4C] */ INVALID, INVALID, INVALID, INVALID, - -/* [50] */ INVALID, TNSZ("sqrtss",XMM,4), TNSZ("rsqrtss",XMM,4), TNSZ("rcpss",XMM,4), -/* [54] */ INVALID, INVALID, INVALID, INVALID, -/* [58] */ TNSZ("addss",XMM,4), TNSZ("mulss",XMM,4), TNSZ("cvtss2sd",XMM,4), TNSZ("cvttps2dq",XMM,16), -/* [5C] */ TNSZ("subss",XMM,4), TNSZ("minss",XMM,4), TNSZ("divss",XMM,4), TNSZ("maxss",XMM,4), - -/* [60] */ INVALID, INVALID, INVALID, INVALID, -/* [64] */ INVALID, INVALID, INVALID, INVALID, -/* [68] */ INVALID, INVALID, INVALID, INVALID, -/* [6C] */ INVALID, INVALID, INVALID, TNSZ("movdqu",XMM,16), - -/* [70] */ TNSZ("pshufhw",XMMP,16),INVALID, INVALID, INVALID, -/* [74] */ INVALID, INVALID, INVALID, INVALID, -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, TNSZ("movq",XMM,8), TNSZ("movdqu",XMMS,16), - -/* [80] */ INVALID, INVALID, INVALID, INVALID, -/* [84] */ INVALID, INVALID, INVALID, INVALID, -/* [88] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [90] */ INVALID, INVALID, INVALID, INVALID, -/* [94] */ INVALID, INVALID, INVALID, INVALID, -/* [98] */ INVALID, INVALID, INVALID, INVALID, -/* [9C] */ INVALID, INVALID, INVALID, INVALID, - -/* [A0] */ INVALID, INVALID, INVALID, INVALID, -/* [A4] */ INVALID, INVALID, INVALID, INVALID, -/* [A8] */ INVALID, INVALID, INVALID, INVALID, -/* [AC] */ INVALID, INVALID, INVALID, INVALID, - -/* [B0] */ INVALID, INVALID, INVALID, INVALID, -/* [B4] */ INVALID, INVALID, INVALID, INVALID, -/* [B8] */ INVALID, INVALID, INVALID, INVALID, -/* [BC] */ INVALID, INVALID, INVALID, INVALID, - -/* [C0] */ INVALID, INVALID, TNSZ("cmpss",XMMP,4), INVALID, -/* [C4] */ INVALID, INVALID, INVALID, INVALID, -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, - -/* [D0] */ INVALID, INVALID, INVALID, INVALID, -/* [D4] */ INVALID, INVALID, TNS("movq2dq",XMMMX), INVALID, -/* [D8] */ INVALID, INVALID, INVALID, INVALID, -/* [DC] */ INVALID, INVALID, INVALID, INVALID, - -/* [E0] */ INVALID, INVALID, INVALID, INVALID, -/* [E4] */ INVALID, INVALID, TNSZ("cvtdq2pd",XMM,8), INVALID, -/* [E8] */ INVALID, INVALID, INVALID, INVALID, -/* [EC] */ INVALID, INVALID, INVALID, INVALID, - -/* [F0] */ INVALID, INVALID, INVALID, INVALID, -/* [F4] */ INVALID, INVALID, INVALID, INVALID, -/* [F8] */ INVALID, INVALID, INVALID, INVALID, -/* [FC] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Decode table for 0x0F opcodes - */ - -const instable_t dis_op0F[16][16] = { 
-{ -/* [00] */ IND(dis_op0F00), IND(dis_op0F01), TNS("lar",MR), TNS("lsl",MR), -/* [04] */ INVALID, TNS("syscall",NORM), TNS("clts",NORM), TNS("sysret",NORM), -/* [08] */ TNS("invd",NORM), TNS("wbinvd",NORM), INVALID, TNS("ud2",NORM), -/* [0C] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [10] */ TNSZ("movups",XMMO,16), TNSZ("movups",XMMOS,16),TNSZ("movlps",XMMO,8), TNSZ("movlps",XMMOS,8), -/* [14] */ TNSZ("unpcklps",XMMO,16),TNSZ("unpckhps",XMMO,16),TNSZ("movhps",XMMOM,8),TNSZ("movhps",XMMOMS,8), -/* [18] */ IND(dis_op0F18), INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [20] */ TSy("mov",SREG), TSy("mov",SREG), TSy("mov",SREG), TSy("mov",SREG), -/* [24] */ TSx("mov",SREG), INVALID, TSx("mov",SREG), INVALID, -/* [28] */ TNSZ("movaps",XMMO,16), TNSZ("movaps",XMMOS,16),TNSZ("cvtpi2ps",XMMOMX,8),TNSZ("movntps",XMMOS,16), -/* [2C] */ TNSZ("cvttps2pi",XMMOXMM,8),TNSZ("cvtps2pi",XMMOXMM,8),TNSZ("ucomiss",XMMO,4),TNSZ("comiss",XMMO,4), -}, { -/* [30] */ TNS("wrmsr",NORM), TNS("rdtsc",NORM), TNS("rdmsr",NORM), TNS("rdpmc",NORM), -/* [34] */ TNSx("sysenter",NORM), TNSx("sysexit",NORM), INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [40] */ TS("cmovx.o",MR), TS("cmovx.no",MR), TS("cmovx.b",MR), TS("cmovx.ae",MR), -/* [44] */ TS("cmovx.e",MR), TS("cmovx.ne",MR), TS("cmovx.be",MR), TS("cmovx.a",MR), -/* [48] */ TS("cmovx.s",MR), TS("cmovx.ns",MR), TS("cmovx.pe",MR), TS("cmovx.po",MR), -/* [4C] */ TS("cmovx.l",MR), TS("cmovx.ge",MR), TS("cmovx.le",MR), TS("cmovx.g",MR), -}, { -/* [50] */ TNS("movmskps",XMMOX3), TNSZ("sqrtps",XMMO,16), TNSZ("rsqrtps",XMMO,16),TNSZ("rcpps",XMMO,16), -/* [54] */ TNSZ("andps",XMMO,16), TNSZ("andnps",XMMO,16), TNSZ("orps",XMMO,16), TNSZ("xorps",XMMO,16), -/* [58] */ TNSZ("addps",XMMO,16), TNSZ("mulps",XMMO,16), TNSZ("cvtps2pd",XMMO,8),TNSZ("cvtdq2ps",XMMO,16), -/* [5C] */ TNSZ("subps",XMMO,16), TNSZ("minps",XMMO,16), TNSZ("divps",XMMO,16), TNSZ("maxps",XMMO,16), -}, { -/* [60] */ TNSZ("punpcklbw",MMO,4),TNSZ("punpcklwd",MMO,4),TNSZ("punpckldq",MMO,4),TNSZ("packsswb",MMO,8), -/* [64] */ TNSZ("pcmpgtb",MMO,8), TNSZ("pcmpgtw",MMO,8), TNSZ("pcmpgtd",MMO,8), TNSZ("packuswb",MMO,8), -/* [68] */ TNSZ("punpckhbw",MMO,8),TNSZ("punpckhwd",MMO,8),TNSZ("punpckhdq",MMO,8),TNSZ("packssdw",MMO,8), -/* [6C] */ TNSZ("INVALID",MMO,0), TNSZ("INVALID",MMO,0), TNSZ("movd",MMO,4), TNSZ("movq",MMO,8), -}, { -/* [70] */ TNSZ("pshufw",MMOPM,8), TNS("psrXXX",MR), TNS("psrXXX",MR), TNS("psrXXX",MR), -/* [74] */ TNSZ("pcmpeqb",MMO,8), TNSZ("pcmpeqw",MMO,8), TNSZ("pcmpeqd",MMO,8), TNS("emms",NORM), -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, TNSZ("movd",MMOS,4), TNSZ("movq",MMOS,8), -}, { -/* [80] */ TNS("jo",D), TNS("jno",D), TNS("jb",D), TNS("jae",D), -/* [84] */ TNS("je",D), TNS("jne",D), TNS("jbe",D), TNS("ja",D), -/* [88] */ TNS("js",D), TNS("jns",D), TNS("jp",D), TNS("jnp",D), -/* [8C] */ TNS("jl",D), TNS("jge",D), TNS("jle",D), TNS("jg",D), -}, { -/* [90] */ TNS("seto",Mb), TNS("setno",Mb), TNS("setb",Mb), TNS("setae",Mb), -/* [94] */ TNS("sete",Mb), TNS("setne",Mb), TNS("setbe",Mb), TNS("seta",Mb), -/* [98] */ TNS("sets",Mb), TNS("setns",Mb), TNS("setp",Mb), TNS("setnp",Mb), -/* [9C] */ TNS("setl",Mb), TNS("setge",Mb), TNS("setle",Mb), TNS("setg",Mb), -}, { -/* [A0] */ TSp("push",LSEG), TSp("pop",LSEG), TNS("cpuid",NORM), TS("bt",RMw), -/* [A4] */ TS("shld",DSHIFT), TS("shld",DSHIFTcl), INVALID, INVALID, -/* [A8] */ TSp("push",LSEG), 
TSp("pop",LSEG), TNS("rsm",NORM), TS("bts",RMw), -/* [AC] */ TS("shrd",DSHIFT), TS("shrd",DSHIFTcl), IND(dis_op0FAE), TS("imul",MRw), -}, { -/* [B0] */ TNS("cmpxchgb",RMw), TS("cmpxchg",RMw), TS("lss",MR), TS("btr",RMw), -/* [B4] */ TS("lfs",MR), TS("lgs",MR), TS("movzb",MOVZ), TNS("movzwl",MOVZ), -/* [B8] */ INVALID, INVALID, IND(dis_op0FBA), TS("btc",RMw), -/* [BC] */ TS("bsf",MRw), TS("bsr",MRw), TS("movsb",MOVZ), TNS("movswl",MOVZ), -}, { -/* [C0] */ TNS("xaddb",XADDB), TS("xadd",RMw), TNSZ("cmpps",XMMOPM,16),TNS("movnti",RM), -/* [C4] */ TNSZ("pinsrw",MMOPRM,2),TNS("pextrw",MMO3P), TNSZ("shufps",XMMOPM,16),IND(dis_op0FC7), -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [D0] */ INVALID, TNSZ("psrlw",MMO,8), TNSZ("psrld",MMO,8), TNSZ("psrlq",MMO,8), -/* [D4] */ TNSZ("paddq",MMO,8), TNSZ("pmullw",MMO,8), TNSZ("INVALID",MMO,0), TNS("pmovmskb",MMOM3), -/* [D8] */ TNSZ("psubusb",MMO,8), TNSZ("psubusw",MMO,8), TNSZ("pminub",MMO,8), TNSZ("pand",MMO,8), -/* [DC] */ TNSZ("paddusb",MMO,8), TNSZ("paddusw",MMO,8), TNSZ("pmaxub",MMO,8), TNSZ("pandn",MMO,8), -}, { -/* [E0] */ TNSZ("pavgb",MMO,8), TNSZ("psraw",MMO,8), TNSZ("psrad",MMO,8), TNSZ("pavgw",MMO,8), -/* [E4] */ TNSZ("pmulhuw",MMO,8), TNSZ("pmulhw",MMO,8), TNS("INVALID",XMMO), TNSZ("movntq",MMOMS,8), -/* [E8] */ TNSZ("psubsb",MMO,8), TNSZ("psubsw",MMO,8), TNSZ("pminsw",MMO,8), TNSZ("por",MMO,8), -/* [EC] */ TNSZ("paddsb",MMO,8), TNSZ("paddsw",MMO,8), TNSZ("pmaxsw",MMO,8), TNSZ("pxor",MMO,8), -}, { -/* [F0] */ INVALID, TNSZ("psllw",MMO,8), TNSZ("pslld",MMO,8), TNSZ("psllq",MMO,8), -/* [F4] */ TNSZ("pmuludq",MMO,8), TNSZ("pmaddwd",MMO,8), TNSZ("psadbw",MMO,8), TNSZ("maskmovq",MMOIMPL,8), -/* [F8] */ TNSZ("psubb",MMO,8), TNSZ("psubw",MMO,8), TNSZ("psubd",MMO,8), TNSZ("psubq",MMO,8), -/* [FC] */ TNSZ("paddb",MMO,8), TNSZ("paddw",MMO,8), TNSZ("paddd",MMO,8), INVALID, -} }; - - -/* - * Decode table for 0x80 opcodes - */ - -const instable_t dis_op80[8] = { - -/* [0] */ TNS("addb",IMlw), TNS("orb",IMw), TNS("adcb",IMlw), TNS("sbbb",IMlw), -/* [4] */ TNS("andb",IMw), TNS("subb",IMlw), TNS("xorb",IMw), TNS("cmpb",IMlw), -}; - - -/* - * Decode table for 0x81 opcodes. - */ - -const instable_t dis_op81[8] = { - -/* [0] */ TS("add",IMlw), TS("or",IMw), TS("adc",IMlw), TS("sbb",IMlw), -/* [4] */ TS("and",IMw), TS("sub",IMlw), TS("xor",IMw), TS("cmp",IMlw), -}; - - -/* - * Decode table for 0x82 opcodes. - */ - -const instable_t dis_op82[8] = { - -/* [0] */ TNSx("addb",IMlw), TNSx("orb",IMlw), TNSx("adcb",IMlw), TNSx("sbbb",IMlw), -/* [4] */ TNSx("andb",IMlw), TNSx("subb",IMlw), TNSx("xorb",IMlw), TNSx("cmpb",IMlw), -}; -/* - * Decode table for 0x83 opcodes. - */ - -const instable_t dis_op83[8] = { - -/* [0] */ TS("add",IMlw), TS("or",IMlw), TS("adc",IMlw), TS("sbb",IMlw), -/* [4] */ TS("and",IMlw), TS("sub",IMlw), TS("xor",IMlw), TS("cmp",IMlw), -}; - -/* - * Decode table for 0xC0 opcodes. - */ - -const instable_t dis_opC0[8] = { - -/* [0] */ TNS("rolb",MvI), TNS("rorb",MvI), TNS("rclb",MvI), TNS("rcrb",MvI), -/* [4] */ TNS("shlb",MvI), TNS("shrb",MvI), INVALID, TNS("sarb",MvI), -}; - -/* - * Decode table for 0xD0 opcodes. - */ - -const instable_t dis_opD0[8] = { - -/* [0] */ TNS("rolb",Mv), TNS("rorb",Mv), TNS("rclb",Mv), TNS("rcrb",Mv), -/* [4] */ TNS("shlb",Mv), TNS("shrb",Mv), TNS("salb",Mv), TNS("sarb",Mv), -}; - -/* - * Decode table for 0xC1 opcodes. 
- * 186 instruction set - */ - -const instable_t dis_opC1[8] = { - -/* [0] */ TS("rol",MvI), TS("ror",MvI), TS("rcl",MvI), TS("rcr",MvI), -/* [4] */ TS("shl",MvI), TS("shr",MvI), TS("sal",MvI), TS("sar",MvI), -}; - -/* - * Decode table for 0xD1 opcodes. - */ - -const instable_t dis_opD1[8] = { - -/* [0] */ TS("rol",Mv), TS("ror",Mv), TS("rcl",Mv), TS("rcr",Mv), -/* [4] */ TS("shl",Mv), TS("shr",Mv), TS("sal",Mv), TS("sar",Mv), -}; - - -/* - * Decode table for 0xD2 opcodes. - */ - -const instable_t dis_opD2[8] = { - -/* [0] */ TNS("rolb",Mv), TNS("rorb",Mv), TNS("rclb",Mv), TNS("rcrb",Mv), -/* [4] */ TNS("shlb",Mv), TNS("shrb",Mv), TNS("salb",Mv), TNS("sarb",Mv), -}; -/* - * Decode table for 0xD3 opcodes. - */ - -const instable_t dis_opD3[8] = { - -/* [0] */ TS("rol",Mv), TS("ror",Mv), TS("rcl",Mv), TS("rcr",Mv), -/* [4] */ TS("shl",Mv), TS("shr",Mv), TS("salb",Mv), TS("sar",Mv), -}; - - -/* - * Decode table for 0xF6 opcodes. - */ - -const instable_t dis_opF6[8] = { - -/* [0] */ TNS("testb",IMw), TNS("testb",IMw), TNS("notb",Mw), TNS("negb",Mw), -/* [4] */ TNS("mulb",MA), TNS("imulb",MA), TNS("divb",MA), TNS("idivb",MA), -}; - - -/* - * Decode table for 0xF7 opcodes. - */ - -const instable_t dis_opF7[8] = { - -/* [0] */ TS("test",IMw), TS("test",IMw), TS("not",Mw), TS("neg",Mw), -/* [4] */ TS("mul",MA), TS("imul",MA), TS("div",MA), TS("idiv",MA), -}; - - -/* - * Decode table for 0xFE opcodes. - */ - -const instable_t dis_opFE[8] = { - -/* [0] */ TNS("incb",Mw), TNS("decb",Mw), INVALID, INVALID, -/* [4] */ INVALID, INVALID, INVALID, INVALID, -}; -/* - * Decode table for 0xFF opcodes. - */ - -const instable_t dis_opFF[8] = { - -/* [0] */ TS("inc",Mw), TS("dec",Mw), TNSyp("call",INM), TNS("lcall",INM), -/* [4] */ TNSy("jmp",INM), TNS("ljmp",INM), TSp("push",M), INVALID, -}; - -/* for 287 instructions, which are a mess to decode */ - -const instable_t dis_opFP1n2[8][8] = { -{ -/* bit pattern: 1101 1xxx MODxx xR/M */ -/* [0,0] */ TNS("fadds",M), TNS("fmuls",M), TNS("fcoms",M), TNS("fcomps",M), -/* [0,4] */ TNS("fsubs",M), TNS("fsubrs",M), TNS("fdivs",M), TNS("fdivrs",M), -}, { -/* [1,0] */ TNS("flds",M), INVALID, TNS("fsts",M), TNS("fstps",M), -/* [1,4] */ TNSZ("fldenv",M,28), TNSZ("fldcw",M,2), TNSZ("fnstenv",M,28), TNSZ("fnstcw",M,2), -}, { -/* [2,0] */ TNS("fiaddl",M), TNS("fimull",M), TNS("ficoml",M), TNS("ficompl",M), -/* [2,4] */ TNS("fisubl",M), TNS("fisubrl",M), TNS("fidivl",M), TNS("fidivrl",M), -}, { -/* [3,0] */ TNS("fildl",M), INVALID, TNS("fistl",M), TNS("fistpl",M), -/* [3,4] */ INVALID, TNSZ("fldt",M,10), INVALID, TNSZ("fstpt",M,10), -}, { -/* [4,0] */ TNSZ("faddl",M,8), TNSZ("fmull",M,8), TNSZ("fcoml",M,8), TNSZ("fcompl",M,8), -/* [4,1] */ TNSZ("fsubl",M,8), TNSZ("fsubrl",M,8), TNSZ("fdivl",M,8), TNSZ("fdivrl",M,8), -}, { -/* [5,0] */ TNSZ("fldl",M,8), INVALID, TNSZ("fstl",M,8), TNSZ("fstpl",M,8), -/* [5,4] */ TNSZ("frstor",M,108), INVALID, TNSZ("fnsave",M,108), TNSZ("fnstsw",M,2), -}, { -/* [6,0] */ TNSZ("fiadd",M,2), TNSZ("fimul",M,2), TNSZ("ficom",M,2), TNSZ("ficomp",M,2), -/* [6,4] */ TNSZ("fisub",M,2), TNSZ("fisubr",M,2), TNSZ("fidiv",M,2), TNSZ("fidivr",M,2), -}, { -/* [7,0] */ TNSZ("fild",M,2), INVALID, TNSZ("fist",M,2), TNSZ("fistp",M,2), -/* [7,4] */ TNSZ("fbld",M,10), TNSZ("fildll",M,8), TNSZ("fbstp",M,10), TNSZ("fistpll",M,8), -} }; - -const instable_t dis_opFP3[8][8] = { -{ -/* bit pattern: 1101 1xxx 11xx xREG */ -/* [0,0] */ TNS("fadd",FF), TNS("fmul",FF), TNS("fcom",F), TNS("fcomp",F), -/* [0,4] */ TNS("fsub",FF), TNS("fsubr",FF), TNS("fdiv",FF), TNS("fdivr",FF), 
-}, { -/* [1,0] */ TNS("fld",F), TNS("fxch",F), TNS("fnop",NORM), TNS("fstp",F), -/* [1,4] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [2,0] */ INVALID, INVALID, INVALID, INVALID, -/* [2,4] */ INVALID, TNS("fucompp",NORM), INVALID, INVALID, -}, { -/* [3,0] */ INVALID, INVALID, INVALID, INVALID, -/* [3,4] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [4,0] */ TNS("fadd",FF), TNS("fmul",FF), TNS("fcom",F), TNS("fcomp",F), -/* [4,4] */ TNS("fsub",FF), TNS("fsubr",FF), TNS("fdiv",FF), TNS("fdivr",FF), -}, { -/* [5,0] */ TNS("ffree",F), TNS("fxch",F), TNS("fst",F), TNS("fstp",F), -/* [5,4] */ TNS("fucom",F), TNS("fucomp",F), INVALID, INVALID, -}, { -/* [6,0] */ TNS("faddp",FF), TNS("fmulp",FF), TNS("fcomp",F), TNS("fcompp",NORM), -/* [6,4] */ TNS("fsubp",FF), TNS("fsubrp",FF), TNS("fdivp",FF), TNS("fdivrp",FF), -}, { -/* [7,0] */ TNS("ffree",F), TNS("fxch",F), TNS("fstp",F), TNS("fstp",F), -/* [7,4] */ TNS("fnstsw",M), TNS("fucomip",FFC), TNS("fcomip",FFC), INVALID, -} }; - -const instable_t dis_opFP4[4][8] = { -{ -/* bit pattern: 1101 1001 111x xxxx */ -/* [0,0] */ TNS("fchs",NORM), TNS("fabs",NORM), INVALID, INVALID, -/* [0,4] */ TNS("ftst",NORM), TNS("fxam",NORM), TNS("ftstp",NORM), INVALID, -}, { -/* [1,0] */ TNS("fld1",NORM), TNS("fldl2t",NORM), TNS("fldl2e",NORM), TNS("fldpi",NORM), -/* [1,4] */ TNS("fldlg2",NORM), TNS("fldln2",NORM), TNS("fldz",NORM), INVALID, -}, { -/* [2,0] */ TNS("f2xm1",NORM), TNS("fyl2x",NORM), TNS("fptan",NORM), TNS("fpatan",NORM), -/* [2,4] */ TNS("fxtract",NORM), TNS("fprem1",NORM), TNS("fdecstp",NORM), TNS("fincstp",NORM), -}, { -/* [3,0] */ TNS("fprem",NORM), TNS("fyl2xp1",NORM), TNS("fsqrt",NORM), TNS("fsincos",NORM), -/* [3,4] */ TNS("frndint",NORM), TNS("fscale",NORM), TNS("fsin",NORM), TNS("fcos",NORM), -} }; - -const instable_t dis_opFP5[8] = { -/* bit pattern: 1101 1011 111x xxxx */ -/* [0] */ TNS("feni",NORM), TNS("fdisi",NORM), TNS("fnclex",NORM), TNS("fninit",NORM), -/* [4] */ TNS("fsetpm",NORM), TNS("frstpm",NORM), INVALID, INVALID, -}; - -const instable_t dis_opFP6[8] = { -/* bit pattern: 1101 1011 11yy yxxx */ -/* [00] */ TNS("fcmov.nb",FF), TNS("fcmov.ne",FF), TNS("fcmov.nbe",FF), TNS("fcmov.nu",FF), -/* [04] */ INVALID, TNS("fucomi",F), TNS("fcomi",F), INVALID, -}; - -const instable_t dis_opFP7[8] = { -/* bit pattern: 1101 1010 11yy yxxx */ -/* [00] */ TNS("fcmov.b",FF), TNS("fcmov.e",FF), TNS("fcmov.be",FF), TNS("fcmov.u",FF), -/* [04] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Main decode table for the op codes. The first two nibbles - * will be used as an index into the table. If there is a - * a need to further decode an instruction, the array to be - * referenced is indicated with the other two entries being - * empty. 
- */ - -const instable_t dis_distable[16][16] = { -{ -/* [0,0] */ TNS("addb",RMw), TS("add",RMw), TNS("addb",MRw), TS("add",MRw), -/* [0,4] */ TNS("addb",IA), TS("add",IA), TSx("push",SEG), TSx("pop",SEG), -/* [0,8] */ TNS("orb",RMw), TS("or",RMw), TNS("orb",MRw), TS("or",MRw), -/* [0,C] */ TNS("orb",IA), TS("or",IA), TSx("push",SEG), IND(&dis_op0F[0][0]), -}, { -/* [1,0] */ TNS("adcb",RMw), TS("adc",RMw), TNS("adcb",MRw), TS("adc",MRw), -/* [1,4] */ TNS("adcb",IA), TS("adc",IA), TSx("push",SEG), TSx("pop",SEG), -/* [1,8] */ TNS("sbbb",RMw), TS("sbb",RMw), TNS("sbbb",MRw), TS("sbb",MRw), -/* [1,C] */ TNS("sbbb",IA), TS("sbb",IA), TSx("push",SEG), TSx("pop",SEG), -}, { -/* [2,0] */ TNS("andb",RMw), TS("and",RMw), TNS("andb",MRw), TS("and",MRw), -/* [2,4] */ TNS("andb",IA), TS("and",IA), TNSx("%es:",OVERRIDE), TNSx("daa",NORM), -/* [2,8] */ TNS("subb",RMw), TS("sub",RMw), TNS("subb",MRw), TS("sub",MRw), -/* [2,C] */ TNS("subb",IA), TS("sub",IA), TNSx("%cs:",OVERRIDE), TNSx("das",NORM), -}, { -/* [3,0] */ TNS("xorb",RMw), TS("xor",RMw), TNS("xorb",MRw), TS("xor",MRw), -/* [3,4] */ TNS("xorb",IA), TS("xor",IA), TNSx("%ss:",OVERRIDE), TNSx("aaa",NORM), -/* [3,8] */ TNS("cmpb",RMw), TS("cmp",RMw), TNS("cmpb",MRw), TS("cmp",MRw), -/* [3,C] */ TNS("cmpb",IA), TS("cmp",IA), TNSx("%ds:",OVERRIDE), TNSx("aas",NORM), -}, { -/* [4,0] */ TSx("inc",R), TSx("inc",R), TSx("inc",R), TSx("inc",R), -/* [4,4] */ TSx("inc",R), TSx("inc",R), TSx("inc",R), TSx("inc",R), -/* [4,8] */ TSx("dec",R), TSx("dec",R), TSx("dec",R), TSx("dec",R), -/* [4,C] */ TSx("dec",R), TSx("dec",R), TSx("dec",R), TSx("dec",R), -}, { -/* [5,0] */ TSp("push",R), TSp("push",R), TSp("push",R), TSp("push",R), -/* [5,4] */ TSp("push",R), TSp("push",R), TSp("push",R), TSp("push",R), -/* [5,8] */ TSp("pop",R), TSp("pop",R), TSp("pop",R), TSp("pop",R), -/* [5,C] */ TSp("pop",R), TSp("pop",R), TSp("pop",R), TSp("pop",R), -}, { -/* [6,0] */ TSZx("pusha",IMPLMEM,28),TSZx("popa",IMPLMEM,28), TSx("bound",MR), TNS("arpl",RMw), -/* [6,4] */ TNS("%fs:",OVERRIDE), TNS("%gs:",OVERRIDE), TNS("data16",DM), TNS("addr16",AM), -/* [6,8] */ TSp("push",I), TS("imul",IMUL), TSp("push",Ib), TS("imul",IMUL), -/* [6,C] */ TNSZ("insb",IMPLMEM,1), TSZ("ins",IMPLMEM,4), TNSZ("outsb",IMPLMEM,1),TSZ("outs",IMPLMEM,4), -}, { -/* [7,0] */ TNSy("jo",BD), TNSy("jno",BD), TNSy("jb",BD), TNSy("jae",BD), -/* [7,4] */ TNSy("je",BD), TNSy("jne",BD), TNSy("jbe",BD), TNSy("ja",BD), -/* [7,8] */ TNSy("js",BD), TNSy("jns",BD), TNSy("jp",BD), TNSy("jnp",BD), -/* [7,C] */ TNSy("jl",BD), TNSy("jge",BD), TNSy("jle",BD), TNSy("jg",BD), -}, { -/* [8,0] */ IND(dis_op80), IND(dis_op81), INDx(dis_op82), IND(dis_op83), -/* [8,4] */ TNS("testb",RMw), TS("test",RMw), TNS("xchgb",RMw), TS("xchg",RMw), -/* [8,8] */ TNS("movb",RMw), TS("mov",RMw), TNS("movb",MRw), TS("mov",MRw), -/* [8,C] */ TNS("movw",SM), TS("lea",MR), TNS("movw",MS), TSp("pop",M), -}, { -/* [9,0] */ TNS("nop",NORM), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), -/* [9,4] */ TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), -/* [9,8] */ TNS("cXtX",CBW), TNS("cXtX",CWD), TNSx("lcall",SO), TNS("fwait",NORM), -/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNSx("sahf",NORM), TNSx("lahf",NORM), -}, { -/* [A,0] */ TNS("movb",OA), TS("mov",OA), TNS("movb",AO), TS("mov",AO), -/* [A,4] */ TNSZ("movsb",SD,1), TS("movs",SD), TNSZ("cmpsb",SD,1), TS("cmps",SD), -/* [A,8] */ TNS("testb",IA), TS("test",IA), TNS("stosb",AD), TS("stos",AD), -/* [A,C] */ TNS("lodsb",SA), TS("lods",SA), TNS("scasb",AD), TS("scas",AD), -}, { 
-/* [B,0] */ TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), -/* [B,4] */ TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), -/* [B,8] */ TS("mov",IR), TS("mov",IR), TS("mov",IR), TS("mov",IR), -/* [B,C] */ TS("mov",IR), TS("mov",IR), TS("mov",IR), TS("mov",IR), -}, { -/* [C,0] */ IND(dis_opC0), IND(dis_opC1), TNSyp("ret",RET), TNSyp("ret",NORM), -/* [C,4] */ TNSx("les",MR), TNSx("lds",MR), TNS("movb",IMw), TS("mov",IMw), -/* [C,8] */ TNSyp("enter",ENTER), TNSyp("leave",NORM), TNS("lret",RET), TNS("lret",NORM), -/* [C,C] */ TNS("int",INT3), TNS("int",INTx), TNSx("into",NORM), TNS("iret",NORM), -}, { -/* [D,0] */ IND(dis_opD0), IND(dis_opD1), IND(dis_opD2), IND(dis_opD3), -/* [D,4] */ TNSx("aam",U), TNSx("aad",U), TNSx("falc",NORM), TNSZ("xlat",IMPLMEM,1), - -/* 287 instructions. Note that although the indirect field */ -/* indicates opFP1n2 for further decoding, this is not necessarily */ -/* the case since the opFP arrays are not partitioned according to key1 */ -/* and key2. opFP1n2 is given only to indicate that we haven't */ -/* finished decoding the instruction. */ -/* [D,8] */ IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), -/* [D,C] */ IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), -}, { -/* [E,0] */ TNSy("loopnz",BD), TNSy("loopz",BD), TNSy("loop",BD), TNSy("jcxz",BD), -/* [E,4] */ TNS("inb",P), TS("in",P), TNS("outb",P), TS("out",P), -/* [E,8] */ TNSyp("call",D), TNSy("jmp",D), TNSx("ljmp",SO), TNSy("jmp",BD), -/* [E,C] */ TNS("inb",V), TS("in",V), TNS("outb",V), TS("out",V), -}, { -/* [F,0] */ TNS("lock",LOCK), TNS("icebp", NORM), TNS("repnz",PREFIX), TNS("repz",PREFIX), -/* [F,4] */ TNS("hlt",NORM), TNS("cmc",NORM), IND(dis_opF6), IND(dis_opF7), -/* [F,8] */ TNS("clc",NORM), TNS("stc",NORM), TNS("cli",NORM), TNS("sti",NORM), -/* [F,C] */ TNS("cld",NORM), TNS("std",NORM), IND(dis_opFE), IND(dis_opFF), -} }; - -/* END CSTYLED */ - -/* - * common functions to decode and disassemble an x86 or amd64 instruction - */ - -/* - * These are the individual fields of a REX prefix. Note that a REX - * prefix with none of these set is still needed to: - * - use the MOVSXD (sign extend 32 to 64 bits) instruction - * - access the %sil, %dil, %bpl, %spl registers - */ -#define REX_W 0x08 /* 64 bit operand size when set */ -#define REX_R 0x04 /* high order bit extension of ModRM reg field */ -#define REX_X 0x02 /* high order bit extension of SIB index field */ -#define REX_B 0x01 /* extends ModRM r_m, SIB base, or opcode reg */ - -static uint_t opnd_size; /* SIZE16, SIZE32 or SIZE64 */ -static uint_t addr_size; /* SIZE16, SIZE32 or SIZE64 */ - -/* - * Even in 64 bit mode, usually only 4 byte immediate operands are supported. - */ -static int isize[] = {1, 2, 4, 4}; -static int isize64[] = {1, 2, 4, 8}; - -/* - * Just a bunch of useful macros. - */ -#define WBIT(x) (x & 0x1) /* to get w bit */ -#define REGNO(x) (x & 0x7) /* to get 3 bit register */ -#define VBIT(x) ((x)>>1 & 0x1) /* to get 'v' bit */ -#define OPSIZE(osize, wbit) ((wbit) ? isize[osize] : 1) -#define OPSIZE64(osize, wbit) ((wbit) ? 
isize64[osize] : 1) - -#define REG_ONLY 3 /* mode to indicate a register operand (not memory) */ - -#define BYTE_OPND 0 /* w-bit value indicating byte register */ -#define LONG_OPND 1 /* w-bit value indicating opnd_size register */ -#define MM_OPND 2 /* "value" used to indicate a mmx reg */ -#define XMM_OPND 3 /* "value" used to indicate a xmm reg */ -#define SEG_OPND 4 /* "value" used to indicate a segment reg */ -#define CONTROL_OPND 5 /* "value" used to indicate a control reg */ -#define DEBUG_OPND 6 /* "value" used to indicate a debug reg */ -#define TEST_OPND 7 /* "value" used to indicate a test reg */ -#define WORD_OPND 8 /* w-bit value indicating word size reg */ - -/* - * Get the next byte and separate the op code into the high and low nibbles. - */ -static int -dtrace_get_opcode(dis86_t *x, uint_t *high, uint_t *low) -{ - int byte; - - /* - * x86 instructions have a maximum length of 15 bytes. Bail out if - * we try to read more. - */ - if (x->d86_len >= 15) - return (x->d86_error = 1); - - if (x->d86_error) - return (1); - byte = x->d86_get_byte(x->d86_data); - if (byte < 0) - return (x->d86_error = 1); - x->d86_bytes[x->d86_len++] = byte; - *low = byte & 0xf; /* ----xxxx low 4 bits */ - *high = byte >> 4 & 0xf; /* xxxx---- bits 7 to 4 */ - return (0); -} - -/* - * Get and decode an SIB (scaled index base) byte - */ -static void -dtrace_get_SIB(dis86_t *x, uint_t *ss, uint_t *index, uint_t *base) -{ - int byte; - - if (x->d86_error) - return; - - byte = x->d86_get_byte(x->d86_data); - if (byte < 0) { - x->d86_error = 1; - return; - } - x->d86_bytes[x->d86_len++] = byte; - - *base = byte & 0x7; - *index = (byte >> 3) & 0x7; - *ss = (byte >> 6) & 0x3; -} - -/* - * Get the byte following the op code and separate it into the - * mode, register, and r/m fields. - */ -static void -dtrace_get_modrm(dis86_t *x, uint_t *mode, uint_t *reg, uint_t *r_m) -{ - if (x->d86_got_modrm == 0) { - if (x->d86_rmindex == -1) - x->d86_rmindex = x->d86_len; - dtrace_get_SIB(x, mode, reg, r_m); - x->d86_got_modrm = 1; - } -} - -/* - * Adjust register selection based on any REX prefix bits present. - */ -/*ARGSUSED*/ -static void -dtrace_rex_adjust(uint_t rex_prefix, uint_t mode, uint_t *reg, uint_t *r_m) -{ - if (reg != NULL && r_m == NULL) { - if (rex_prefix & REX_B) - *reg += 8; - } else { - if (reg != NULL && (REX_R & rex_prefix) != 0) - *reg += 8; - if (r_m != NULL && (REX_B & rex_prefix) != 0) - *r_m += 8; - } -} - -/* - * Get an immediate operand of the given size, with sign extension. 
- */ -static void -dtrace_imm_opnd(dis86_t *x, int wbit, int size, int opindex) -{ - int i; - int byte; - int valsize = 0; - - if (x->d86_numopnds < opindex + 1) - x->d86_numopnds = opindex + 1; - - switch (wbit) { - case BYTE_OPND: - valsize = 1; - break; - case LONG_OPND: - if (x->d86_opnd_size == SIZE16) - valsize = 2; - else if (x->d86_opnd_size == SIZE32) - valsize = 4; - else - valsize = 8; - break; - case MM_OPND: - case XMM_OPND: - case SEG_OPND: - case CONTROL_OPND: - case DEBUG_OPND: - case TEST_OPND: - valsize = size; - break; - case WORD_OPND: - valsize = 2; - break; - } - if (valsize < size) - valsize = size; - - if (x->d86_error) - return; - x->d86_opnd[opindex].d86_value = 0; - for (i = 0; i < size; ++i) { - byte = x->d86_get_byte(x->d86_data); - if (byte < 0) { - x->d86_error = 1; - return; - } - x->d86_bytes[x->d86_len++] = byte; - x->d86_opnd[opindex].d86_value |= (uint64_t)byte << (i * 8); - } - /* Do sign extension */ - if (x->d86_bytes[x->d86_len - 1] & 0x80) { - for (; i < valsize; i++) - x->d86_opnd[opindex].d86_value |= - (uint64_t)0xff << (i* 8); - } -#ifdef DIS_TEXT - x->d86_opnd[opindex].d86_mode = MODE_SIGNED; - x->d86_opnd[opindex].d86_value_size = valsize; - x->d86_imm_bytes += size; -#endif -} - -/* - * Get an ip relative operand of the given size, with sign extension. - */ -static void -dtrace_disp_opnd(dis86_t *x, int wbit, int size, int opindex) -{ - dtrace_imm_opnd(x, wbit, size, opindex); -#ifdef DIS_TEXT - x->d86_opnd[opindex].d86_mode = MODE_IPREL; -#endif -} - -/* - * Check to see if there is a segment override prefix pending. - * If so, print it in the current 'operand' location and set - * the override flag back to false. - */ -/*ARGSUSED*/ -static void -dtrace_check_override(dis86_t *x, int opindex) -{ -#ifdef DIS_TEXT - if (x->d86_seg_prefix) { - (void) strlcat(x->d86_opnd[opindex].d86_prefix, - x->d86_seg_prefix, PFIXLEN); - } -#endif - x->d86_seg_prefix = NULL; -} - - -/* - * Process a single instruction Register or Memory operand. - * - * mode = addressing mode from ModRM byte - * r_m = r_m (or reg if mode == 3) field from ModRM byte - * wbit = indicates which register (8bit, 16bit, ... MMX, etc.) set to use. - * o = index of operand that we are processing (0, 1 or 2) - * - * the value of reg or r_m must have already been adjusted for any REX prefix. 
- */ -/*ARGSUSED*/ -static void -dtrace_get_operand(dis86_t *x, uint_t mode, uint_t r_m, int wbit, int opindex) -{ - int have_SIB = 0; /* flag presence of scale-index-byte */ - uint_t ss; /* scale-factor from opcode */ - uint_t index; /* index register number */ - uint_t base; /* base register number */ - int dispsize; /* size of displacement in bytes */ -#ifdef DIS_TEXT - char *opnd = x->d86_opnd[opindex].d86_opnd; -#endif - - if (x->d86_numopnds < opindex + 1) - x->d86_numopnds = opindex + 1; - - if (x->d86_error) - return; - - /* - * first handle a simple register - */ - if (mode == REG_ONLY) { -#ifdef DIS_TEXT - switch (wbit) { - case MM_OPND: - (void) strlcat(opnd, dis_MMREG[r_m], OPLEN); - break; - case XMM_OPND: - (void) strlcat(opnd, dis_XMMREG[r_m], OPLEN); - break; - case SEG_OPND: - (void) strlcat(opnd, dis_SEGREG[r_m], OPLEN); - break; - case CONTROL_OPND: - (void) strlcat(opnd, dis_CONTROLREG[r_m], OPLEN); - break; - case DEBUG_OPND: - (void) strlcat(opnd, dis_DEBUGREG[r_m], OPLEN); - break; - case TEST_OPND: - (void) strlcat(opnd, dis_TESTREG[r_m], OPLEN); - break; - case BYTE_OPND: - if (x->d86_rex_prefix == 0) - (void) strlcat(opnd, dis_REG8[r_m], OPLEN); - else - (void) strlcat(opnd, dis_REG8_REX[r_m], OPLEN); - break; - case WORD_OPND: - (void) strlcat(opnd, dis_REG16[r_m], OPLEN); - break; - case LONG_OPND: - if (x->d86_opnd_size == SIZE16) - (void) strlcat(opnd, dis_REG16[r_m], OPLEN); - else if (x->d86_opnd_size == SIZE32) - (void) strlcat(opnd, dis_REG32[r_m], OPLEN); - else - (void) strlcat(opnd, dis_REG64[r_m], OPLEN); - break; - } -#endif /* DIS_TEXT */ - return; - } - - /* - * if symbolic representation, skip override prefix, if any - */ - dtrace_check_override(x, opindex); - - /* - * Handle 16 bit memory references first, since they decode - * the mode values more simply. - * mode 1 is r_m + 8 bit displacement - * mode 2 is r_m + 16 bit displacement - * mode 0 is just r_m, unless r_m is 6 which is 16 bit disp - */ - if (x->d86_addr_size == SIZE16) { - if ((mode == 0 && r_m == 6) || mode == 2) - dtrace_imm_opnd(x, WORD_OPND, 2, opindex); - else if (mode == 1) - dtrace_imm_opnd(x, BYTE_OPND, 1, opindex); -#ifdef DIS_TEXT - if (mode == 0 && r_m == 6) - x->d86_opnd[opindex].d86_mode = MODE_SIGNED; - else if (mode == 0) - x->d86_opnd[opindex].d86_mode = MODE_NONE; - else - x->d86_opnd[opindex].d86_mode = MODE_OFFSET; - (void) strlcat(opnd, dis_addr16[mode][r_m], OPLEN); -#endif - return; - } - - /* - * 32 and 64 bit addressing modes are more complex since they - * can involve an SIB (scaled index and base) byte to decode. - */ - if (r_m == ESP_REGNO || r_m == ESP_REGNO + 8) { - have_SIB = 1; - dtrace_get_SIB(x, &ss, &index, &base); - if (x->d86_error) - return; - if (base != 5 || mode != 0) - if (x->d86_rex_prefix & REX_B) - base += 8; - if (x->d86_rex_prefix & REX_X) - index += 8; - } else { - base = r_m; - } - - /* - * Compute the displacement size and get its bytes - */ - dispsize = 0; - - if (mode == 1) - dispsize = 1; - else if (mode == 2) - dispsize = 4; - else if ((r_m & 7) == EBP_REGNO || - (have_SIB && (base & 7) == EBP_REGNO)) - dispsize = 4; - - if (dispsize > 0) { - dtrace_imm_opnd(x, dispsize == 4 ? 
LONG_OPND : BYTE_OPND, - dispsize, opindex); - if (x->d86_error) - return; - } - -#ifdef DIS_TEXT - if (dispsize > 0) - x->d86_opnd[opindex].d86_mode = MODE_OFFSET; - - if (have_SIB == 0) { - if (x->d86_mode == SIZE32) { - if (mode == 0) - (void) strlcat(opnd, dis_addr32_mode0[r_m], - OPLEN); - else - (void) strlcat(opnd, dis_addr32_mode12[r_m], - OPLEN); - } else { - if (mode == 0) - (void) strlcat(opnd, dis_addr64_mode0[r_m], - OPLEN); - else - (void) strlcat(opnd, dis_addr64_mode12[r_m], - OPLEN); - } - } else { - uint_t need_paren = 0; - char **regs; - if (x->d86_mode == SIZE32) /* NOTE this is not addr_size! */ - regs = (char **)dis_REG32; - else - regs = (char **)dis_REG64; - - /* - * print the base (if any) - */ - if (base == EBP_REGNO && mode == 0) { - if (index != ESP_REGNO) { - (void) strlcat(opnd, "(", OPLEN); - need_paren = 1; - } - } else { - (void) strlcat(opnd, "(", OPLEN); - (void) strlcat(opnd, regs[base], OPLEN); - need_paren = 1; - } - - /* - * print the index (if any) - */ - if (index != ESP_REGNO) { - (void) strlcat(opnd, ",", OPLEN); - (void) strlcat(opnd, regs[index], OPLEN); - (void) strlcat(opnd, dis_scale_factor[ss], OPLEN); - } else - if (need_paren) - (void) strlcat(opnd, ")", OPLEN); - } -#endif -} - -/* - * Operand sequence for standard instruction involving one register - * and one register/memory operand. - * wbit indicates a byte(0) or opnd_size(1) operation - * vbit indicates direction (0 for "opcode r,r_m") or (1 for "opcode r_m, r") - */ -#define STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, vbit) { \ - dtrace_get_modrm(x, &mode, ®, &r_m); \ - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ - dtrace_get_operand(x, mode, r_m, wbit, vbit); \ - dtrace_get_operand(x, REG_ONLY, reg, wbit, 1 - vbit); \ -} - -/* - * Similar to above, but allows for the two operands to be of different - * classes (ie. wbit). - * wbit is for the r_m operand - * w2 is for the reg operand - */ -#define MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, w2, vbit) { \ - dtrace_get_modrm(x, &mode, ®, &r_m); \ - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ - dtrace_get_operand(x, mode, r_m, wbit, vbit); \ - dtrace_get_operand(x, REG_ONLY, reg, w2, 1 - vbit); \ -} - -/* - * Similar, but for 2 operands plus an immediate. - */ -#define THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize) { \ - dtrace_get_modrm(x, &mode, ®, &r_m); \ - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ - dtrace_get_operand(x, mode, r_m, wbit, 1); \ - dtrace_get_operand(x, REG_ONLY, reg, w2, 2); \ - dtrace_imm_opnd(x, wbit, immsize, 0); \ -} - -/* - * Dissassemble a single x86 or amd64 instruction. - * - * Mode determines the default operating mode (SIZE16, SIZE32 or SIZE64) - * for interpreting instructions. 
- * - * returns non-zero for bad opcode - */ -int -dtrace_disx86(dis86_t *x, uint_t cpu_mode) -{ - const instable_t *dp = NULL; /* decode table being used */ -#ifdef DIS_TEXT - uint_t i; -#endif -#ifdef DIS_MEM - uint_t nomem = 0; -#define NOMEM (nomem = 1) -#else -#define NOMEM /* nothing */ -#endif - uint_t wbit = 0; /* opcode wbit, 0 is 8 bit, !0 for opnd_size */ - uint_t w2; /* wbit value for second operand */ - uint_t vbit; - uint_t mode = 0; /* mode value from ModRM byte */ - uint_t reg; /* reg value from ModRM byte */ - uint_t r_m; /* r_m value from ModRM byte */ - - uint_t opcode1; /* high nibble of 1st byte */ - uint_t opcode2; /* low nibble of 1st byte */ - uint_t opcode3; /* extra opcode bits usually from ModRM byte */ - uint_t opcode4; /* high nibble of 2nd byte */ - uint_t opcode5 = 0; /* low nibble of 2ne byte */ /* XXX: gcc */ - uint_t opcode6; /* high nibble of 3rd byte */ - uint_t opcode7 = 0; /* low nibble of 3rd byte */ /* XXX: gcc */ - uint_t opcode_bytes = 1; - - /* - * legacy prefixes come in 5 flavors, you should have only one of each - */ - uint_t opnd_size_prefix = 0; - uint_t addr_size_prefix = 0; - uint_t segment_prefix = 0; - uint_t lock_prefix = 0; - uint_t rep_prefix = 0; - uint_t rex_prefix = 0; /* amd64 register extension prefix */ - size_t off; - - x->d86_len = 0; - x->d86_rmindex = -1; - x->d86_error = 0; -#ifdef DIS_TEXT - x->d86_numopnds = 0; - x->d86_seg_prefix = NULL; - x->d86_mneu[0] = 0; - for (i = 0; i < 3; ++i) { - x->d86_opnd[i].d86_opnd[0] = 0; - x->d86_opnd[i].d86_prefix[0] = 0; - x->d86_opnd[i].d86_value_size = 0; - x->d86_opnd[i].d86_value = 0; - x->d86_opnd[i].d86_mode = MODE_NONE; - } -#endif - x->d86_error = 0; - x->d86_memsize = 0; - - if (cpu_mode == SIZE16) { - opnd_size = SIZE16; - addr_size = SIZE16; - } else if (cpu_mode == SIZE32) { - opnd_size = SIZE32; - addr_size = SIZE32; - } else { - opnd_size = SIZE32; - addr_size = SIZE64; - } - - /* - * Get one opcode byte and check for zero padding that follows - * jump tables. - */ - if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) - goto error; - - if (opcode1 == 0 && opcode2 == 0 && - x->d86_check_func != NULL && x->d86_check_func(x->d86_data)) { -#ifdef DIS_TEXT - (void) strncpy(x->d86_mneu, ".byte\t0", OPLEN); -#endif - goto done; - } - - /* - * Gather up legacy x86 prefix bytes. - */ - for (;;) { - uint_t *which_prefix = NULL; - - dp = &dis_distable[opcode1][opcode2]; - - switch (dp->it_adrmode) { - case PREFIX: - which_prefix = &rep_prefix; - break; - case LOCK: - which_prefix = &lock_prefix; - break; - case OVERRIDE: - which_prefix = &segment_prefix; -#ifdef DIS_TEXT - x->d86_seg_prefix = (char *)dp->it_name; -#endif - if (dp->it_invalid64 && cpu_mode == SIZE64) - goto error; - break; - case AM: - which_prefix = &addr_size_prefix; - break; - case DM: - which_prefix = &opnd_size_prefix; - break; - } - if (which_prefix == NULL) - break; - *which_prefix = (opcode1 << 4) | opcode2; - if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) - goto error; - } - - /* - * Handle amd64 mode PREFIX values. - * Some of the segment prefixes are no-ops. (only FS/GS actually work) - * We might have a REX prefix (opcodes 0x40-0x4f) - */ - if (cpu_mode == SIZE64) { - if (segment_prefix != 0x64 && segment_prefix != 0x65) - segment_prefix = 0; - - if (opcode1 == 0x4) { - rex_prefix = (opcode1 << 4) | opcode2; - if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) - goto error; - dp = &dis_distable[opcode1][opcode2]; - } - } - - /* - * Deal with selection of operand and address size now. 
- * Note that the REX.W bit being set causes opnd_size_prefix to be - * ignored. - */ - if (cpu_mode == SIZE64) { - if (rex_prefix & 0x08) - opnd_size = SIZE64; - else if (opnd_size_prefix) - opnd_size = SIZE16; - - if (addr_size_prefix) - addr_size = SIZE32; - } else if (cpu_mode == SIZE32) { - if (opnd_size_prefix) - opnd_size = SIZE16; - if (addr_size_prefix) - addr_size = SIZE16; - } else { - if (opnd_size_prefix) - opnd_size = SIZE32; - if (addr_size_prefix) - addr_size = SIZE32; - } - - /* - * The pause instruction - a repz'd nop. This doesn't fit - * with any of the other prefix goop added for SSE, so we'll - * special-case it here. - */ - if (rep_prefix == 0xf3 && opcode1 == 0x9 && opcode2 == 0x0) { - rep_prefix = 0; - dp = &dis_opPause; - } - - /* - * Some 386 instructions have 2 bytes of opcode before the mod_r/m - * byte so we may need to perform a table indirection. - */ - if (dp->it_indirect == dis_op0F[0]) { - if (dtrace_get_opcode(x, &opcode4, &opcode5) != 0) - goto error; - opcode_bytes = 2; - if (opcode4 == 0x7 && opcode5 >= 0x1 && opcode5 <= 0x3) { - uint_t subcode; - - if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0) - goto error; - opcode_bytes = 3; - subcode = ((opcode6 & 0x3) << 1) | - ((opcode7 & 0x8) >> 3); - dp = &dis_op0F7123[opcode5][subcode]; - } else if ((opcode4 == 0xc) && (opcode5 >= 0x8)) { - dp = &dis_op0FC8[0]; - } else { - dp = &dis_op0F[opcode4][opcode5]; - } - } - - /* - * If still not at a TERM decode entry, then a ModRM byte - * exists and its fields further decode the instruction. - */ - x->d86_got_modrm = 0; - if (dp->it_indirect != TERM) { - dtrace_get_modrm(x, &mode, &opcode3, &r_m); - if (x->d86_error) - goto error; - reg = opcode3; - - /* - * decode 287 instructions (D8-DF) from opcodeN - */ - if (opcode1 == 0xD && opcode2 >= 0x8) { - if (opcode2 == 0xB && mode == 0x3 && opcode3 == 4) - dp = &dis_opFP5[r_m]; - else if (opcode2 == 0xA && mode == 0x3 && opcode3 < 4) - dp = &dis_opFP7[opcode3]; - else if (opcode2 == 0xB && mode == 0x3) - dp = &dis_opFP6[opcode3]; - else if (opcode2 == 0x9 && mode == 0x3 && opcode3 >= 4) - dp = &dis_opFP4[opcode3 - 4][r_m]; - else if (mode == 0x3) - dp = &dis_opFP3[opcode2 - 8][opcode3]; - else - dp = &dis_opFP1n2[opcode2 - 8][opcode3]; - } else { - dp = dp->it_indirect + opcode3; - } - } - - /* - * In amd64 bit mode, ARPL opcode is changed to MOVSXD - * (sign extend 32bit to 64 bit) - */ - if (cpu_mode == SIZE64 && opcode1 == 0x6 && opcode2 == 0x3) - dp = &dis_opMOVSLD; - - /* - * at this point we should have a correct (or invalid) opcode - */ - if ((cpu_mode == SIZE64 && dp->it_invalid64) || - (cpu_mode != SIZE64 && dp->it_invalid32)) - goto error; - if (dp->it_indirect != TERM) - goto error; - - /* - * deal with MMX/SSE opcodes which are changed by prefixes - */ - switch (dp->it_adrmode) { - case MMO: - case MMOIMPL: - case MMO3P: - case MMOM3: - case MMOMS: - case MMOPM: - case MMOPRM: - case MMOS: - case XMMO: - case XMMOM: - case XMMOMS: - case XMMOPM: - case XMMOS: - case XMMOMX: - case XMMOX3: - case XMMOXMM: - /* - * This is horrible. Some SIMD instructions take the - * form 0x0F 0x?? ..., which is easily decoded using the - * existing tables. Other SIMD instructions use various - * prefix bytes to overload existing instructions. For - * Example, addps is F0, 58, whereas addss is F3 (repz), - * F0, 58. Presumably someone got a raise for this. 
- * - * If we see one of the instructions which can be - * modified in this way (if we've got one of the SIMDO* - * address modes), we'll check to see if the last prefix - * was a repz. If it was, we strip the prefix from the - * mnemonic, and we indirect using the dis_opSIMDrepz - * table. - */ - - /* - * Calculate our offset in dis_op0F - */ - if ((uintptr_t)dp - (uintptr_t)dis_op0F > sizeof (dis_op0F)) - goto error; - - off = ((uintptr_t)dp - (uintptr_t)dis_op0F) / - sizeof (instable_t); - - /* - * Rewrite if this instruction used one of the magic prefixes. - */ - if (rep_prefix) { - if (rep_prefix == 0xf2) - dp = &dis_opSIMDrepnz[off]; - else - dp = &dis_opSIMDrepz[off]; - rep_prefix = 0; - } else if (opnd_size_prefix) { - dp = &dis_opSIMDdata16[off]; - opnd_size_prefix = 0; - if (opnd_size == SIZE16) - opnd_size = SIZE32; - } - break; - - case MMOSH: - /* - * As with the "normal" SIMD instructions, the MMX - * shuffle instructions are overloaded. These - * instructions, however, are special in that they use - * an extra byte, and thus an extra table. As of this - * writing, they only use the opnd_size prefix. - */ - - /* - * Calculate our offset in dis_op0F7123 - */ - if ((uintptr_t)dp - (uintptr_t)dis_op0F7123 > - sizeof (dis_op0F7123)) - goto error; - - if (opnd_size_prefix) { - off = ((uintptr_t)dp - (uintptr_t)dis_op0F7123) / - sizeof (instable_t); - dp = &dis_opSIMD7123[off]; - opnd_size_prefix = 0; - if (opnd_size == SIZE16) - opnd_size = SIZE32; - } - break; - } - - /* - * In 64 bit mode, some opcodes automatically use opnd_size == SIZE64. - */ - if (cpu_mode == SIZE64) - if (dp->it_always64 || (opnd_size == SIZE32 && dp->it_stackop)) - opnd_size = SIZE64; - -#ifdef DIS_TEXT - /* - * At this point most instructions can format the opcode mnemonic - * including the prefixes. - */ - if (lock_prefix) - (void) strlcat(x->d86_mneu, "lock ", OPLEN); - - if (rep_prefix == 0xf2) - (void) strlcat(x->d86_mneu, "repnz ", OPLEN); - else if (rep_prefix == 0xf3) - (void) strlcat(x->d86_mneu, "repz ", OPLEN); - - if (cpu_mode == SIZE64 && addr_size_prefix) - (void) strlcat(x->d86_mneu, "addr32 ", OPLEN); - - if (dp->it_adrmode != CBW && - dp->it_adrmode != CWD && - dp->it_adrmode != XMMSFNC) { - if (strcmp(dp->it_name, "INVALID") == 0) - goto error; - (void) strlcat(x->d86_mneu, dp->it_name, OPLEN); - if (dp->it_suffix) { - char *types[] = {"", "w", "l", "q"}; - if (opcode_bytes == 2 && opcode4 == 4) { - /* It's a cmovx.yy. Replace the suffix x */ - for (i = 5; i < OPLEN; i++) { - if (x->d86_mneu[i] == '.') - break; - } - x->d86_mneu[i - 1] = *types[opnd_size]; - } else { - (void) strlcat(x->d86_mneu, types[opnd_size], - OPLEN); - } - } - } -#endif - - /* - * Process operands based on the addressing modes. 
- */ - x->d86_mode = cpu_mode; - x->d86_rex_prefix = rex_prefix; - x->d86_opnd_size = opnd_size; - x->d86_addr_size = addr_size; - vbit = 0; /* initialize for mem/reg -> reg */ - switch (dp->it_adrmode) { - /* - * amd64 instruction to sign extend 32 bit reg/mem operands - * into 64 bit register values - */ - case MOVSXZ: -#ifdef DIS_TEXT - if (rex_prefix == 0) - (void) strncpy(x->d86_mneu, "movzld", OPLEN); -#endif - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - x->d86_opnd_size = SIZE64; - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - x->d86_opnd_size = opnd_size = SIZE32; - wbit = LONG_OPND; - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* - * movsbl movsbw movsbq (0x0FBE) or movswl movswq (0x0FBF) - * movzbl movzbw movzbq (0x0FB6) or mobzwl movzwq (0x0FB7) - * wbit lives in 2nd byte, note that operands - * are different sized - */ - case MOVZ: - if (rex_prefix & REX_W) { - /* target register size = 64 bit */ - x->d86_mneu[5] = 'q'; - } - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - x->d86_opnd_size = opnd_size = SIZE16; - wbit = WBIT(opcode5); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* - * imul instruction, with either 8-bit or longer immediate - * opcode 0x6B for byte, sign-extended displacement, 0x69 for word(s) - */ - case IMUL: - wbit = LONG_OPND; - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, - OPSIZE(opnd_size, opcode2 == 0x9)); - break; - - /* memory or register operand to register, with 'w' bit */ - case MRw: - wbit = WBIT(opcode2); - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); - break; - - /* register to memory or register operand, with 'w' bit */ - /* arpl happens to fit here also because it is odd */ - case RMw: - if (opcode_bytes == 2) - wbit = WBIT(opcode5); - else - wbit = WBIT(opcode2); - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* xaddb instruction */ - case XADDB: - wbit = 0; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* MMX register to memory or register operand */ - case MMS: - case MMOS: -#ifdef DIS_TEXT - wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; -#else - wbit = LONG_OPND; -#endif - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1); - break; - - /* MMX register to memory */ - case MMOMS: - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode == REG_ONLY) - goto error; - wbit = MM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1); - break; - - /* Double shift. Has immediate operand specifying the shift. */ - case DSHIFT: - wbit = LONG_OPND; - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 2); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - dtrace_imm_opnd(x, wbit, 1, 0); - break; - - /* - * Double shift. With no immediate operand, specifies using %cl. 
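[Most of the addressing modes in this switch start by pulling the mode/reg/r_m triple out of the ModRM byte with dtrace_get_modrm(). For reference, the field split is the standard x86 encoding; split_modrm() below is purely illustrative (REG_ONLY corresponds to the value 3 tested in the code above):

	static void
	split_modrm(uint_t byte, uint_t *mode, uint_t *reg, uint_t *r_m)
	{
		*mode = (byte >> 6) & 0x3;	/* 3 == REG_ONLY, otherwise a memory form */
		*reg  = (byte >> 3) & 0x7;	/* register, or an opcode extension */
		*r_m  = byte & 0x7;		/* register, or base of the effective address */
	}
]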
- */ - case DSHIFTcl: - wbit = LONG_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* immediate to memory or register operand */ - case IMlw: - wbit = WBIT(opcode2); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); - /* - * Have long immediate for opcode 0x81, but not 0x80 nor 0x83 - */ - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, opcode2 == 1), 0); - break; - - /* immediate to memory or register operand with the */ - /* 'w' bit present */ - case IMw: - wbit = WBIT(opcode2); - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0); - break; - - /* immediate to register with register in low 3 bits */ - /* of op code */ - case IR: - /* w-bit here (with regs) is bit 3 */ - wbit = opcode2 >>3 & 0x1; - reg = REGNO(opcode2); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - mode = REG_ONLY; - r_m = reg; - dtrace_get_operand(x, mode, r_m, wbit, 1); - dtrace_imm_opnd(x, wbit, OPSIZE64(opnd_size, wbit), 0); - break; - - /* MMX immediate shift of register */ - case MMSH: - case MMOSH: - wbit = MM_OPND; - goto mm_shift; /* in next case */ - - /* SIMD immediate shift of register */ - case XMMSH: - wbit = XMM_OPND; -mm_shift: - reg = REGNO(opcode7); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); - dtrace_imm_opnd(x, wbit, 1, 0); - NOMEM; - break; - - /* accumulator to memory operand */ - case AO: - vbit = 1; - /*FALLTHROUGH*/ - - /* memory operand to accumulator */ - case OA: - wbit = WBIT(opcode2); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1 - vbit); - dtrace_imm_opnd(x, wbit, OPSIZE64(addr_size, LONG_OPND), vbit); -#ifdef DIS_TEXT - x->d86_opnd[vbit].d86_mode = MODE_OFFSET; -#endif - break; - - - /* segment register to memory or register operand */ - case SM: - vbit = 1; - /*FALLTHROUGH*/ - - /* memory or register operand to segment register */ - case MS: - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, LONG_OPND, vbit); - dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 1 - vbit); - break; - - /* - * rotate or shift instructions, which may shift by 1 or - * consult the cl register, depending on the 'v' bit - */ - case Mv: - vbit = VBIT(opcode2); - wbit = WBIT(opcode2); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); -#ifdef DIS_TEXT - if (vbit) { - (void) strlcat(x->d86_opnd[0].d86_opnd, "%cl", OPLEN); - } else { - x->d86_opnd[0].d86_mode = MODE_SIGNED; - x->d86_opnd[0].d86_value_size = 1; - x->d86_opnd[0].d86_value = 1; - } -#endif - break; - /* - * immediate rotate or shift instructions - */ - case MvI: - wbit = WBIT(opcode2); -normal_imm_mem: - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); - dtrace_imm_opnd(x, wbit, 1, 0); - break; - - /* bit test instructions */ - case MIb: - wbit = LONG_OPND; - goto normal_imm_mem; - - /* single memory or register operand with 'w' bit present */ - case Mw: - wbit = WBIT(opcode2); -just_mem: - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - case SWAPGS: - if (cpu_mode == SIZE64 && mode == 3 && r_m == 0) { -#ifdef DIS_TEXT - (void) strncpy(x->d86_mneu, "swapgs", OPLEN); -#endif - NOMEM; - break; - } - /*FALLTHROUGH*/ - - /* prefetch 
instruction - memory operand, but no memory acess */ - case PREF: - NOMEM; - /*FALLTHROUGH*/ - - /* single memory or register operand */ - case M: - wbit = LONG_OPND; - goto just_mem; - - /* single memory or register byte operand */ - case Mb: - wbit = BYTE_OPND; - goto just_mem; - - case MO: - /* Similar to M, but only memory (no direct registers) */ - wbit = LONG_OPND; - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode == 3) - goto error; - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* move special register to register or reverse if vbit */ - case SREG: - switch (opcode5) { - - case 2: - vbit = 1; - /*FALLTHROUGH*/ - case 0: - wbit = CONTROL_OPND; - break; - - case 3: - vbit = 1; - /*FALLTHROUGH*/ - case 1: - wbit = DEBUG_OPND; - break; - - case 6: - vbit = 1; - /*FALLTHROUGH*/ - case 4: - wbit = TEST_OPND; - break; - - } - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit); - dtrace_get_operand(x, REG_ONLY, r_m, LONG_OPND, 1 - vbit); - NOMEM; - break; - - /* - * single register operand with register in the low 3 - * bits of op code - */ - case R: - if (opcode_bytes == 2) - reg = REGNO(opcode5); - else - reg = REGNO(opcode2); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0); - NOMEM; - break; - - /* - * register to accumulator with register in the low 3 - * bits of op code, xchg instructions - */ - case RA: - NOMEM; - reg = REGNO(opcode2); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, LONG_OPND, 1); - break; - - /* - * single segment register operand, with register in - * bits 3-4 of op code byte - */ - case SEG: - NOMEM; - reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x3; - dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0); - break; - - /* - * single segment register operand, with register in - * bits 3-5 of op code - */ - case LSEG: - NOMEM; - /* long seg reg from opcode */ - reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x7; - dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0); - break; - - /* memory or register operand to register */ - case MR: - wbit = LONG_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); - break; - - case RM: - wbit = LONG_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* MMX/SIMD-Int memory or mm reg to mm reg */ - case MM: - case MMO: -#ifdef DIS_TEXT - wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; -#else - wbit = LONG_OPND; -#endif - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0); - break; - - case MMOIMPL: -#ifdef DIS_TEXT - wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; -#else - wbit = LONG_OPND; -#endif - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - dtrace_get_operand(x, REG_ONLY, reg, MM_OPND, 1); - mode = 0; /* change for memory access size... 
*/ - break; - - /* MMX/SIMD-Int and SIMD-FP predicated mm reg to r32 */ - case MMO3P: - wbit = MM_OPND; - goto xmm3p; - case XMM3P: - wbit = XMM_OPND; -xmm3p: - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 1); - NOMEM; - break; - - /* MMX/SIMD-Int predicated r32/mem to mm reg */ - case MMOPRM: - wbit = LONG_OPND; - w2 = MM_OPND; - goto xmmprm; - case XMMPRM: - wbit = LONG_OPND; - w2 = XMM_OPND; -xmmprm: - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, 1); - break; - - /* MMX/SIMD-Int predicated mm/mem to mm reg */ - case MMOPM: - wbit = w2 = MM_OPND; - goto xmmprm; - - /* MMX/SIMD-Int mm reg to r32 */ - case MMOM3: - NOMEM; - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - wbit = MM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0); - break; - - /* SIMD memory or xmm reg operand to xmm reg */ - case XMM: - case XMMO: - case XMMXIMPL: - wbit = XMM_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); - - if (dp->it_adrmode == XMMXIMPL && mode != REG_ONLY) - goto error; - -#ifdef DIS_TEXT - /* - * movlps and movhlps share opcodes. They differ in the - * addressing modes allowed for their operands. - * movhps and movlhps behave similarly. - */ - if (mode == REG_ONLY) { - if (strcmp(dp->it_name, "movlps") == 0) - (void) strncpy(x->d86_mneu, "movhlps", OPLEN); - else if (strcmp(dp->it_name, "movhps") == 0) - (void) strncpy(x->d86_mneu, "movlhps", OPLEN); - } -#endif - if (dp->it_adrmode == XMMXIMPL) - mode = 0; /* change for memory access size... */ - break; - - /* SIMD xmm reg to memory or xmm reg */ - case XMMS: - case XMMOS: - case XMMMS: - case XMMOMS: - dtrace_get_modrm(x, &mode, ®, &r_m); -#ifdef DIS_TEXT - if ((strcmp(dp->it_name, "movlps") == 0 || - strcmp(dp->it_name, "movhps") == 0 || - strcmp(dp->it_name, "movntps") == 0) && - mode == REG_ONLY) - goto error; -#endif - wbit = XMM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); - break; - - /* SIMD memory to xmm reg */ - case XMMM: - case XMMOM: - wbit = XMM_OPND; - dtrace_get_modrm(x, &mode, ®, &r_m); -#ifdef DIS_TEXT - if (mode == REG_ONLY) { - if (strcmp(dp->it_name, "movhps") == 0) - (void) strncpy(x->d86_mneu, "movlhps", OPLEN); - else - goto error; - } -#endif - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); - break; - - /* SIMD memory or r32 to xmm reg */ - case XMM3MX: - wbit = LONG_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); - break; - - case XMM3MXS: - wbit = LONG_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); - break; - - /* SIMD memory or mm reg to xmm reg */ - case XMMOMX: - /* SIMD mm to xmm */ - case XMMMX: - wbit = MM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); - break; - - /* SIMD memory or xmm reg to mm reg */ - case XMMXMM: - case XMMOXMM: - case XMMXM: - wbit = XMM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0); - break; - - - /* SIMD memory or xmm reg to r32 */ - case XMMXM3: - wbit = XMM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0); - break; - - /* SIMD xmm to r32 */ - case XMMX3: - case XMMOX3: - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, XMM_OPND, 0); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - NOMEM; - break; - - /* SIMD predicated memory or xmm reg with/to xmm reg */ - case XMMP: - case 
XMMOPM: - wbit = XMM_OPND; - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); - -#ifdef DIS_TEXT - /* - * cmpps and cmpss vary their instruction name based - * on the value of imm8. Other XMMP instructions, - * such as shufps, require explicit specification of - * the predicate. - */ - if (dp->it_name[0] == 'c' && - dp->it_name[1] == 'm' && - dp->it_name[2] == 'p' && - strlen(dp->it_name) == 5) { - uchar_t pred = x->d86_opnd[0].d86_value & 0xff; - - if (pred >= (sizeof (dis_PREDSUFFIX) / sizeof (char *))) - goto error; - - (void) strncpy(x->d86_mneu, "cmp", OPLEN); - (void) strlcat(x->d86_mneu, dis_PREDSUFFIX[pred], - OPLEN); - (void) strlcat(x->d86_mneu, - dp->it_name + strlen(dp->it_name) - 2, - OPLEN); - x->d86_opnd[0] = x->d86_opnd[1]; - x->d86_opnd[1] = x->d86_opnd[2]; - x->d86_numopnds = 2; - } -#endif - break; - - /* immediate operand to accumulator */ - case IA: - wbit = WBIT(opcode2); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1); - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0); - NOMEM; - break; - - /* memory or register operand to accumulator */ - case MA: - wbit = WBIT(opcode2); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* si register to di register used to reference memory */ - case SD: -#ifdef DIS_TEXT - dtrace_check_override(x, 0); - x->d86_numopnds = 2; - if (addr_size == SIZE64) { - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)", - OPLEN); - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)", - OPLEN); - } else if (addr_size == SIZE32) { - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)", - OPLEN); - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)", - OPLEN); - } else { - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)", - OPLEN); - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)", - OPLEN); - } -#endif - wbit = LONG_OPND; - break; - - /* accumulator to di register */ - case AD: - wbit = WBIT(opcode2); -#ifdef DIS_TEXT - dtrace_check_override(x, 1); - x->d86_numopnds = 2; - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 0); - if (addr_size == SIZE64) - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)", - OPLEN); - else if (addr_size == SIZE32) - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)", - OPLEN); - else - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)", - OPLEN); -#endif - break; - - /* si register to accumulator */ - case SA: - wbit = WBIT(opcode2); -#ifdef DIS_TEXT - dtrace_check_override(x, 0); - x->d86_numopnds = 2; - if (addr_size == SIZE64) - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)", - OPLEN); - else if (addr_size == SIZE32) - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)", - OPLEN); - else - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)", - OPLEN); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1); -#endif - break; - - /* - * single operand, a 16/32 bit displacement - */ - case D: - wbit = LONG_OPND; - dtrace_disp_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0); - NOMEM; - break; - - /* jmp/call indirect to memory or register operand */ - case INM: -#ifdef DIS_TEXT - (void) strlcat(x->d86_opnd[0].d86_prefix, "*", OPLEN); -#endif - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, LONG_OPND, 0); - wbit = LONG_OPND; - break; - - /* - * for long jumps and long calls -- a new code segment - * register and an offset in IP -- stored in object - * code in reverse order. 
Note - not valid in amd64 - */ - case SO: - dtrace_check_override(x, 1); - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 1); -#ifdef DIS_TEXT - x->d86_opnd[1].d86_mode = MODE_SIGNED; -#endif - /* will now get segment operand */ - dtrace_imm_opnd(x, wbit, 2, 0); - break; - - /* - * jmp/call. single operand, 8 bit displacement. - * added to current EIP in 'compofff' - */ - case BD: - dtrace_disp_opnd(x, BYTE_OPND, 1, 0); - NOMEM; - break; - - /* single 32/16 bit immediate operand */ - case I: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0); - break; - - /* single 8 bit immediate operand */ - case Ib: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, 1, 0); - break; - - case ENTER: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, 2, 0); - dtrace_imm_opnd(x, wbit, 1, 1); - switch (opnd_size) { - case SIZE64: - x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 8; - break; - case SIZE32: - x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 4; - break; - case SIZE16: - x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 2; - break; - } - - break; - - /* 16-bit immediate operand */ - case RET: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, 2, 0); - break; - - /* single 8 bit port operand */ - case P: - dtrace_check_override(x, 0); - dtrace_imm_opnd(x, BYTE_OPND, 1, 0); - NOMEM; - break; - - /* single operand, dx register (variable port instruction) */ - case V: - x->d86_numopnds = 1; - dtrace_check_override(x, 0); -#ifdef DIS_TEXT - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%dx)", OPLEN); -#endif - NOMEM; - break; - - /* - * The int instruction, which has two forms: - * int 3 (breakpoint) or - * int n, where n is indicated in the subsequent - * byte (format Ib). The int 3 instruction (opcode 0xCC), - * where, although the 3 looks like an operand, - * it is implied by the opcode. It must be converted - * to the correct base and output. - */ - case INT3: -#ifdef DIS_TEXT - x->d86_numopnds = 1; - x->d86_opnd[0].d86_mode = MODE_SIGNED; - x->d86_opnd[0].d86_value_size = 1; - x->d86_opnd[0].d86_value = 3; -#endif - NOMEM; - break; - - /* single 8 bit immediate operand */ - case INTx: - dtrace_imm_opnd(x, BYTE_OPND, 1, 0); - NOMEM; - break; - - /* an unused byte must be discarded */ - case U: - if (x->d86_get_byte(x->d86_data) < 0) - goto error; - x->d86_len++; - NOMEM; - break; - - case CBW: -#ifdef DIS_TEXT - if (opnd_size == SIZE16) - (void) strlcat(x->d86_mneu, "cbtw", OPLEN); - else if (opnd_size == SIZE32) - (void) strlcat(x->d86_mneu, "cwtl", OPLEN); - else - (void) strlcat(x->d86_mneu, "cltq", OPLEN); -#endif - wbit = LONG_OPND; - NOMEM; - break; - - case CWD: -#ifdef DIS_TEXT - if (opnd_size == SIZE16) - (void) strlcat(x->d86_mneu, "cwtd", OPLEN); - else if (opnd_size == SIZE32) - (void) strlcat(x->d86_mneu, "cltd", OPLEN); - else - (void) strlcat(x->d86_mneu, "cqtd", OPLEN); -#endif - wbit = LONG_OPND; - NOMEM; - break; - - case XMMSFNC: - /* - * sfence is sfence if mode is REG_ONLY. If mode isn't - * REG_ONLY, mnemonic should be 'clflush'. 
- */ - dtrace_get_modrm(x, &mode, ®, &r_m); - - /* sfence doesn't take operands */ -#ifdef DIS_TEXT - if (mode == REG_ONLY) { - (void) strlcat(x->d86_mneu, "sfence", OPLEN); - } else { - (void) strlcat(x->d86_mneu, "clflush", OPLEN); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0); - NOMEM; - } -#else - if (mode != REG_ONLY) { - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0); - NOMEM; - } -#endif - break; - - /* - * no disassembly, the mnemonic was all there was so go on - */ - case NORM: - if (dp->it_invalid32 && cpu_mode != SIZE64) - goto error; - NOMEM; - /*FALLTHROUGH*/ - case IMPLMEM: - break; - - case XMMFENCE: - /* - * Only the following exact byte sequences are allowed: - * - * 0f ae e8 lfence - * 0f ae f0 mfence - */ - if ((uint8_t)x->d86_bytes[x->d86_len - 1] != 0xe8 && - (uint8_t)x->d86_bytes[x->d86_len - 1] != 0xf0) - goto error; - - break; - - - /* float reg */ - case F: -#ifdef DIS_TEXT - x->d86_numopnds = 1; - (void) strlcat(x->d86_opnd[0].d86_opnd, "%st(X)", OPLEN); - x->d86_opnd[0].d86_opnd[4] = r_m + '0'; -#endif - NOMEM; - break; - - /* float reg to float reg, with ret bit present */ - case FF: - vbit = opcode2 >> 2 & 0x1; /* vbit = 1: st -> st(i) */ - /*FALLTHROUGH*/ - case FFC: /* case for vbit always = 0 */ -#ifdef DIS_TEXT - x->d86_numopnds = 2; - (void) strlcat(x->d86_opnd[1 - vbit].d86_opnd, "%st", OPLEN); - (void) strlcat(x->d86_opnd[vbit].d86_opnd, "%st(X)", OPLEN); - x->d86_opnd[vbit].d86_opnd[4] = r_m + '0'; -#endif - NOMEM; - break; - - /* an invalid op code */ - case AM: - case DM: - case OVERRIDE: - case PREFIX: - case UNKNOWN: - NOMEM; - default: - goto error; - } /* end switch */ - if (x->d86_error) - goto error; - -done: -#ifdef DIS_MEM - /* - * compute the size of any memory accessed by the instruction - */ - if (x->d86_memsize != 0) { - return (0); - } else if (dp->it_stackop) { - switch (opnd_size) { - case SIZE16: - x->d86_memsize = 2; - break; - case SIZE32: - x->d86_memsize = 4; - break; - case SIZE64: - x->d86_memsize = 8; - break; - } - } else if (nomem || mode == REG_ONLY) { - x->d86_memsize = 0; - - } else if (dp->it_size != 0) { - /* - * In 64 bit mode descriptor table entries - * go up to 10 bytes and popf/pushf are always 8 bytes - */ - if (x->d86_mode == SIZE64 && dp->it_size == 6) - x->d86_memsize = 10; - else if (x->d86_mode == SIZE64 && opcode1 == 0x9 && - (opcode2 == 0xc || opcode2 == 0xd)) - x->d86_memsize = 8; - else - x->d86_memsize = dp->it_size; - - } else if (wbit == 0) { - x->d86_memsize = 1; - - } else if (wbit == LONG_OPND) { - if (opnd_size == SIZE64) - x->d86_memsize = 8; - else if (opnd_size == SIZE32) - x->d86_memsize = 4; - else - x->d86_memsize = 2; - - } else if (wbit == SEG_OPND) { - x->d86_memsize = 4; - - } else { - x->d86_memsize = 8; - } -#endif - return (0); - -error: -#ifdef DIS_TEXT - (void) strlcat(x->d86_mneu, "undef", OPLEN); -#endif - return (1); -} - -#ifdef DIS_TEXT - -/* - * Some instructions should have immediate operands printed - * as unsigned integers. We compare against this table. - */ -static char *unsigned_ops[] = { - "or", "and", "xor", "test", "in", "out", "lcall", "ljmp", - "rcr", "rcl", "ror", "rol", "shl", "shr", "sal", "psr", "psl", - 0 -}; - -static int -isunsigned_op(char *opcode) -{ - char *where; - int i; - int is_unsigned = 0; - - /* - * Work back to start of last mnemonic, since we may have - * prefixes on some opcodes. 
- */ - where = opcode + strlen(opcode) - 1; - while (where > opcode && *where != ' ') - --where; - if (*where == ' ') - ++where; - - for (i = 0; unsigned_ops[i]; ++i) { - if (strncmp(where, unsigned_ops[i], - strlen(unsigned_ops[i]))) - continue; - is_unsigned = 1; - break; - } - return (is_unsigned); -} - -/* ARGSUSED */ -void -dtrace_disx86_str(dis86_t *dis, uint_t mode, uintptr_t pc, char *buf, - size_t buflen) -{ - int i; - - dis->d86_sprintf_func(buf, buflen, "%-6s ", dis->d86_mneu); - - /* - * For PC-relative jumps, the pc is really the next pc after executing - * this instruction, so increment it appropriately. - */ - pc += dis->d86_len; - - for (i = 0; i < dis->d86_numopnds; i++) { - d86opnd_t *op = &dis->d86_opnd[i]; - int64_t sv; - uint64_t mask; - - if (i != 0) - (void) strlcat(buf, ",", buflen); - - (void) strlcat(buf, op->d86_prefix, buflen); - - sv = op->d86_value; - - switch (op->d86_mode) { - - case MODE_NONE: - - (void) strlcat(buf, op->d86_opnd, buflen); - break; - - case MODE_SIGNED: - case MODE_IMPLIED: - case MODE_OFFSET: - - if (dis->d86_seg_prefix) - (void) strlcat(buf, dis->d86_seg_prefix, - buflen); - - switch (op->d86_value_size) { - case 1: - sv = (int8_t)sv; - mask = 0xff; - break; - case 2: - sv = (int16_t)sv; - mask = 0xffff; - break; - case 4: - sv = (int32_t)sv; - mask = 0xffffffff; - break; - case 8: - mask = 0xffffffffffffffffULL; - break; - } - - if (op->d86_mode == MODE_SIGNED || - op->d86_mode == MODE_IMPLIED) - (void) strlcat(buf, "$", buflen); - - if (sv < 0 && sv > -0xffff && - !isunsigned_op(dis->d86_mneu)) { - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "-0%llo" : "-0x%llx", -sv & mask); - } else { - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "0%llo" : "0x%llx", sv & mask); - } - (void) strlcat(buf, op->d86_opnd, buflen); - break; - - case MODE_IPREL: - - switch (op->d86_value_size) { - case 1: - sv = (int8_t)sv; - break; - case 2: - sv = (int16_t)sv; - break; - case 4: - sv = (int32_t)sv; - break; - } - - if (sv < 0) - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "-0%llo" : "-0x%llx", -sv - dis->d86_len); - else - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "+0%llo" : "+0x%llx", sv + dis->d86_len); - - (void) strlcat(buf, "\t<", buflen); - - if (dis->d86_sym_lookup == NULL || - dis->d86_sym_lookup(dis->d86_data, pc + sv, - buf + strlen(buf), buflen - strlen(buf)) != 0) - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "0%llo" : "0x%llx", pc + sv); - - (void) strlcat(buf, ">", buflen); - - break; - } - } -} - -#endif /* DIS_TEXT */ Index: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h diff -N src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h --- src/external/cddl/osnet/dev/dtrace/amd64/dis_tables.h 21 Feb 2010 01:46:33 -0000 1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,114 +0,0 @@ -/* $NetBSD: dis_tables.h,v 1.2 2010/02/21 01:46:33 darran Exp $ */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dis_tables.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - - -#ifndef _DIS_TABLES_H -#define _DIS_TABLES_H - -#if defined(sun) -#pragma ident "@(#)dis_tables.h 1.7 06/03/02 SMI" -#endif - -/* - * Constants and prototypes for the IA32 disassembler backend. See dis_tables.c - * for usage information and documentation. - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/* - * values for cpu mode - */ -#define SIZE16 1 -#define SIZE32 2 -#define SIZE64 3 - -#define OPLEN 256 -#define PFIXLEN 8 -#define NCPS 12 /* number of chars per symbol */ - -/* - * data structures that must be provided to dtrace_dis86() - */ -typedef struct d86opnd { - char d86_opnd[OPLEN]; /* symbolic rep of operand */ - char d86_prefix[PFIXLEN]; /* any prefix string or "" */ - uint_t d86_mode; /* mode for immediate */ - uint_t d86_value_size; /* size in bytes of d86_value */ - uint64_t d86_value; /* immediate value of opnd */ -} d86opnd_t; - -typedef struct dis86 { - uint_t d86_mode; - uint_t d86_error; - uint_t d86_len; /* instruction length */ - int d86_rmindex; /* index of modrm byte or -1 */ - uint_t d86_memsize; /* size of memory referenced */ - char d86_bytes[16]; /* bytes of instruction */ - char d86_mneu[OPLEN]; - uint_t d86_numopnds; - uint_t d86_rex_prefix; /* value of REX prefix if !0 */ - char *d86_seg_prefix; /* segment prefix, if any */ - uint_t d86_opnd_size; - uint_t d86_addr_size; - uint_t d86_got_modrm; - struct d86opnd d86_opnd[3]; /* up to 3 operands */ - int (*d86_check_func)(void *); - int (*d86_get_byte)(void *); -#ifdef DIS_TEXT - int (*d86_sym_lookup)(void *, uint64_t, char *, size_t); - int (*d86_sprintf_func)(char *, size_t, const char *, ...); - int d86_flags; - uint_t d86_imm_bytes; -#endif - void *d86_data; -} dis86_t; - -extern int dtrace_disx86(dis86_t *x, uint_t cpu_mode); - -#define DIS_OP_OCTAL 0x1 /* Print all numbers in octal */ - -#ifdef DIS_TEXT -extern void dtrace_disx86_str(dis86_t *x, uint_t cpu_mode, uintptr_t pc, - char *buf, size_t len); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _DIS_TABLES_H */ Index: src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S,v retrieving revision 1.7 diff -u -p -r1.7 dtrace_asm.S --- src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S 27 Feb 2017 06:46:59 -0000 1.7 +++ src/external/cddl/osnet/dev/dtrace/amd64/dtrace_asm.S 8 May 2017 22:42:29 -0000 @@ -22,7 +22,7 @@ * * Portions Copyright 2008 John Birrell * - * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dtrace_asm.S,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: 
head/sys/cddl/dev/dtrace/amd64/dtrace_asm.S 298171 2016-04-17 23:08:47Z markj $ * */ /* Index: src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c,v retrieving revision 1.6 diff -u -p -r1.6 dtrace_isa.c --- src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c 27 Feb 2017 06:46:59 -0000 1.6 +++ src/external/cddl/osnet/dev/dtrace/amd64/dtrace_isa.c 20 Apr 2017 11:07:10 -0000 @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dtrace_isa.c,v 1.2.2.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/amd64/dtrace_isa.c 298171 2016-04-17 23:08:47Z markj $ */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. @@ -38,6 +38,8 @@ #include +#include "regset.h" + uint8_t dtrace_fuword8_nocheck(void *); uint16_t dtrace_fuword16_nocheck(void *); uint32_t dtrace_fuword32_nocheck(void *); @@ -54,6 +56,8 @@ struct amd64_frame { typedef unsigned long vm_offset_t; +int dtrace_ustackdepth_max = 2048; + void dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes, uint32_t *intrpc) @@ -107,14 +111,25 @@ static int dtrace_getustack_common(uint64_t *pcstack, int pcstack_limit, uintptr_t pc, uintptr_t sp) { + uintptr_t oldsp; volatile uint16_t *flags = (volatile uint16_t *)&cpu_core[cpu_number()].cpuc_dtrace_flags; int ret = 0; ASSERT(pcstack == NULL || pcstack_limit > 0); + ASSERT(dtrace_ustackdepth_max > 0); while (pc != 0) { - ret++; + /* + * We limit the number of times we can go around this + * loop to account for a circular stack. + */ + if (ret++ >= dtrace_ustackdepth_max) { + *flags |= CPU_DTRACE_BADSTACK; + cpu_core[cpu_number()].cpuc_dtrace_illval = sp; + break; + } + if (pcstack != NULL) { *pcstack++ = (uint64_t)pc; pcstack_limit--; @@ -125,10 +140,18 @@ dtrace_getustack_common(uint64_t *pcstac if (sp == 0) break; + oldsp = sp; + pc = dtrace_fuword64((void *)(sp + offsetof(struct amd64_frame, f_retaddr))); sp = dtrace_fuword64((void *)sp); + if (sp == oldsp) { + *flags |= CPU_DTRACE_BADSTACK; + cpu_core[cpu_number()].cpuc_dtrace_illval = sp; + break; + } + /* * This is totally bogus: if we faulted, we're going to clear * the fault and break. 
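[The two additions above, the dtrace_ustackdepth_max bound and the sp == oldsp check, exist so that a corrupt or circular user stack cannot keep the walk going forever in probe context. Boiled down, the bounded walk looks like the sketch below; walk_ustack() is hypothetical and the cpuc_dtrace_flags error flagging is omitted for brevity:

	static int
	walk_ustack(uintptr_t pc, uintptr_t sp)
	{
		uintptr_t oldsp;
		int depth = 0;

		while (pc != 0) {
			if (depth++ >= dtrace_ustackdepth_max)
				return (-1);		/* over-deep or circular stack */
			if (sp == 0)
				break;
			oldsp = sp;
			pc = dtrace_fuword64((void *)(sp +
			    offsetof(struct amd64_frame, f_retaddr)));
			sp = dtrace_fuword64((void *)sp);
			if (sp == oldsp)
				return (-1);		/* frame links back to itself */
		}
		return (depth);
	}
]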
This is to deal with the apparently @@ -467,11 +490,10 @@ dtrace_getstackdepth(int aframes) return depth - aframes; } -#ifdef notyet ulong_t -dtrace_getreg(struct regs *rp, uint_t reg) +dtrace_getreg(struct trapframe *rp, uint_t reg) { -#if defined(__amd64) + /* CHUQ skipped */ int regmap[] = { REG_GS, /* GS */ REG_FS, /* FS */ @@ -507,72 +529,62 @@ dtrace_getreg(struct regs *rp, uint_t re switch (reg) { case REG_RDI: - return (rp->r_rdi); + return (rp->tf_rdi); case REG_RSI: - return (rp->r_rsi); + return (rp->tf_rsi); case REG_RDX: - return (rp->r_rdx); + return (rp->tf_rdx); case REG_RCX: - return (rp->r_rcx); + return (rp->tf_rcx); case REG_R8: - return (rp->r_r8); + return (rp->tf_r8); case REG_R9: - return (rp->r_r9); + return (rp->tf_r9); case REG_RAX: - return (rp->r_rax); + return (rp->tf_rax); case REG_RBX: - return (rp->r_rbx); + return (rp->tf_rbx); case REG_RBP: - return (rp->r_rbp); + return (rp->tf_rbp); case REG_R10: - return (rp->r_r10); + return (rp->tf_r10); case REG_R11: - return (rp->r_r11); + return (rp->tf_r11); case REG_R12: - return (rp->r_r12); + return (rp->tf_r12); case REG_R13: - return (rp->r_r13); + return (rp->tf_r13); case REG_R14: - return (rp->r_r14); + return (rp->tf_r14); case REG_R15: - return (rp->r_r15); + return (rp->tf_r15); case REG_DS: - return (rp->r_ds); + return (rp->tf_ds); case REG_ES: - return (rp->r_es); + return (rp->tf_es); case REG_FS: - return (rp->r_fs); + return (rp->tf_fs); case REG_GS: - return (rp->r_gs); + return (rp->tf_gs); case REG_TRAPNO: - return (rp->r_trapno); + return (rp->tf_trapno); case REG_ERR: - return (rp->r_err); + return (rp->tf_err); case REG_RIP: - return (rp->r_rip); + return (rp->tf_rip); case REG_CS: - return (rp->r_cs); + return (rp->tf_cs); case REG_SS: - return (rp->r_ss); + return (rp->tf_ss); case REG_RFL: - return (rp->r_rfl); + return (rp->tf_rflags); case REG_RSP: - return (rp->r_rsp); + return (rp->tf_rsp); default: DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); return (0); } - -#else - if (reg > SS) { - DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); - return (0); - } - - return ((&rp->r_gs)[reg]); -#endif } -#endif static int dtrace_copycheck(uintptr_t uaddr, uintptr_t kaddr, size_t size) Index: src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c,v retrieving revision 1.8 diff -u -p -r1.8 dtrace_subr.c --- src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c 27 Feb 2017 06:46:59 -0000 1.8 +++ src/external/cddl/osnet/dev/dtrace/amd64/dtrace_subr.c 20 Apr 2017 11:58:41 -0000 @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/dtrace_subr.c,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/amd64/dtrace_subr.c 313850 2017-02-17 03:27:20Z markj $ * */ /* @@ -29,6 +29,11 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. 
+ */ + +#include #include #include #include @@ -38,18 +43,13 @@ #include #include #include -//#include #include #include #include #include -#include -#include -#include +#include extern uintptr_t kernelbase; -extern uintptr_t dtrace_in_probe_addr; -extern int dtrace_in_probe; int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); @@ -61,6 +61,7 @@ typedef struct dtrace_invop_hdlr { dtrace_invop_hdlr_t *dtrace_invop_hdlr; void dtrace_gethrtime_init(void *); +void dtrace_getnanotime(struct timespec *); int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) @@ -160,122 +161,6 @@ dtrace_sync(void) } #ifdef notyet -int (*dtrace_fasttrap_probe_ptr)(struct regs *); -int (*dtrace_pid_probe_ptr)(struct regs *); -int (*dtrace_return_probe_ptr)(struct regs *); - -void -dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid) -{ - krwlock_t *rwp; - proc_t *p = curproc; - extern void trap(struct regs *, caddr_t, processorid_t); - - if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) { - if (curthread->t_cred != p->p_cred) { - cred_t *oldcred = curthread->t_cred; - /* - * DTrace accesses t_cred in probe context. t_cred - * must always be either NULL, or point to a valid, - * allocated cred structure. - */ - curthread->t_cred = crgetcred(); - crfree(oldcred); - } - } - - if (rp->r_trapno == T_DTRACE_RET) { - uint8_t step = curthread->t_dtrace_step; - uint8_t ret = curthread->t_dtrace_ret; - uintptr_t npc = curthread->t_dtrace_npc; - - if (curthread->t_dtrace_ast) { - aston(curthread); - curthread->t_sig_check = 1; - } - - /* - * Clear all user tracing flags. - */ - curthread->t_dtrace_ft = 0; - - /* - * If we weren't expecting to take a return probe trap, kill - * the process as though it had just executed an unassigned - * trap instruction. - */ - if (step == 0) { - tsignal(curthread, SIGILL); - return; - } - - /* - * If we hit this trap unrelated to a return probe, we're - * just here to reset the AST flag since we deferred a signal - * until after we logically single-stepped the instruction we - * copied out. - */ - if (ret == 0) { - rp->r_pc = npc; - return; - } - - /* - * We need to wait until after we've called the - * dtrace_return_probe_ptr function pointer to set %pc. - */ - rwp = &CPU->cpu_ft_lock; - rw_enter(rwp, RW_READER); - if (dtrace_return_probe_ptr != NULL) - (void) (*dtrace_return_probe_ptr)(rp); - rw_exit(rwp); - rp->r_pc = npc; - - } else if (rp->r_trapno == T_DTRACE_PROBE) { - rwp = &CPU->cpu_ft_lock; - rw_enter(rwp, RW_READER); - if (dtrace_fasttrap_probe_ptr != NULL) - (void) (*dtrace_fasttrap_probe_ptr)(rp); - rw_exit(rwp); - - } else if (rp->r_trapno == T_BPTFLT) { - uint8_t instr; - rwp = &CPU->cpu_ft_lock; - - /* - * The DTrace fasttrap provider uses the breakpoint trap - * (int 3). We let DTrace take the first crack at handling - * this trap; if it's not a probe that DTrace knowns about, - * we call into the trap() routine to handle it like a - * breakpoint placed by a conventional debugger. - */ - rw_enter(rwp, RW_READER); - if (dtrace_pid_probe_ptr != NULL && - (*dtrace_pid_probe_ptr)(rp) == 0) { - rw_exit(rwp); - return; - } - rw_exit(rwp); - - /* - * If the instruction that caused the breakpoint trap doesn't - * look like an int 3 anymore, it may be that this tracepoint - * was removed just after the user thread executed it. In - * that case, return to user land to retry the instuction. 
- */ - if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 && - instr != FASTTRAP_INSTR) { - rp->r_pc--; - return; - } - - trap(rp, addr, cpuid); - - } else { - trap(rp, addr, cpuid); - } -} - void dtrace_safe_synchronous_signal(void) { @@ -321,14 +206,15 @@ dtrace_safe_defer_signal(void) } /* - * If we've executed the original instruction, but haven't performed - * the jmp back to t->t_dtrace_npc or the clean up of any registers - * used to emulate %rip-relative instructions in 64-bit mode, do that - * here and take the signal right away. We detect this condition by - * seeing if the program counter is the range [scrpc + isz, astpc). + * If we have executed the original instruction, but we have performed + * neither the jmp back to t->t_dtrace_npc nor the clean up of any + * registers used to emulate %rip-relative instructions in 64-bit mode, + * we'll save ourselves some effort by doing that here and taking the + * signal right away. We detect this condition by seeing if the program + * counter is the range [scrpc + isz, astpc). */ - if (t->t_dtrace_astpc - rp->r_pc < - t->t_dtrace_astpc - t->t_dtrace_scrpc - isz) { + if (rp->r_pc >= t->t_dtrace_scrpc + isz && + rp->r_pc < t->t_dtrace_astpc) { #ifdef __amd64 /* * If there is a scratch register and we're on the @@ -451,7 +337,6 @@ dtrace_gethrtime_init(void *arg) * another 32-bit integer without overflowing 64-bit. * Thus minimum supported TSC frequency is 62.5MHz. */ - //KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low")); KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT))); /* @@ -472,27 +357,6 @@ dtrace_gethrtime_init(void *arg) /* use skew relative to cpu 0 */ tsc_skew[cpu_index(cinfo)] = cinfo->ci_data.cpu_cc_skew; } - - /* Already handled in x86/tsc.c for ci_data.cpu_cc_skew */ -#if 0 - for (i = 0; i <= mp_maxid; i++) { - if (i == curcpu) - continue; - - if (pcpu_find(i) == NULL) - continue; - - map = 0; - map |= (1 << curcpu); - map |= (1 << i); - - smp_rendezvous_cpus(map, dtrace_gethrtime_init_sync, - dtrace_gethrtime_init_cpu, - smp_no_rendevous_barrier, (void *)(uintptr_t) i); - - tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; - } -#endif } /* @@ -525,14 +389,20 @@ dtrace_gethrtime() uint64_t dtrace_gethrestime(void) { - printf("%s(%d): XXX\n",__func__,__LINE__); - return (0); + struct timespec current_time; + + dtrace_getnanotime(¤t_time); + + return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See amd64/amd64/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { + bool nofault; + cpuid_t cpuid = cpu_number(); /* current cpu id */ + /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets @@ -543,19 +413,19 @@ dtrace_trap(struct trapframe *frame, u_i * Check if DTrace has enabled 'no-fault' mode: * */ - if ((cpu_core[cpu_number()].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { + nofault = (cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0; + if (nofault) { + KASSERTMSG((read_rflags() & PSL_I) == 0, "interrupts enabled"); + /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { - /* Privilieged instruction fault. */ - case T_PRIVINFLT: - break; /* General protection fault. */ case T_PROTFLT: /* Flag an illegal operation. 
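[Note on the dtrace_gethrestime() hunk above: the debug stub is replaced by a real wall-clock read through dtrace_getnanotime(), declared earlier in this file, with the timespec flattened to nanoseconds. A quick worked example of the arithmetic, illustrative only:

	/* { .tv_sec = 3, .tv_nsec = 250000000 }			*/
	/*   => 3 * 1000000000ULL + 250000000 = 3250000000 ns		*/
]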
*/ - cpu_core[cpu_number()].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; + cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; /* * Offset the instruction pointer to the instruction @@ -566,8 +436,8 @@ dtrace_trap(struct trapframe *frame, u_i /* Page fault. */ case T_PAGEFLT: /* Flag a bad address. */ - cpu_core[cpu_number()].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; - cpu_core[cpu_number()].cpuc_dtrace_illval = rcr2(); + cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; + cpu_core[cpuid].cpuc_dtrace_illval = rcr2(); /* * Offset the instruction pointer to the instruction Index: src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c diff -N src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c --- src/external/cddl/osnet/dev/dtrace/amd64/instr_size.c 21 Feb 2010 01:46:33 -0000 1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,134 +0,0 @@ -/* $NetBSD: instr_size.c,v 1.2 2010/02/21 01:46:33 darran Exp $ */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD: src/sys/cddl/dev/dtrace/amd64/instr_size.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - - -#if defined(sun) -#pragma ident "@(#)instr_size.c 1.14 05/07/08 SMI" -#endif - -#include -#include -#include -#if defined(sun) -#include -#include -#include -#include -#include -#else -typedef u_int model_t; -#define DATAMODEL_NATIVE 0 -int dtrace_instr_size(uchar_t *); -#endif - -#include - -/* - * This subsystem (with the minor exception of the instr_size() function) is - * is called from DTrace probe context. This imposes several requirements on - * the implementation: - * - * 1. External subsystems and functions may not be referenced. The one current - * exception is for cmn_err, but only to signal the detection of table - * errors. Assuming the tables are correct, no combination of input is to - * trigger a cmn_err call. - * - * 2. These functions can't be allowed to be traced. To prevent this, - * all functions in the probe path (everything except instr_size()) must - * have names that begin with "dtrace_". 
- */ - -typedef enum dis_isize { - DIS_ISIZE_INSTR, - DIS_ISIZE_OPERAND -} dis_isize_t; - - -/* - * get a byte from instruction stream - */ -static int -dtrace_dis_get_byte(void *p) -{ - int ret; - uchar_t **instr = p; - - ret = **instr; - *instr += 1; - - return (ret); -} - -/* - * Returns either the size of a given instruction, in bytes, or the size of that - * instruction's memory access (if any), depending on the value of `which'. - * If a programming error in the tables is detected, the system will panic to - * ease diagnosis. Invalid instructions will not be flagged. They will appear - * to have an instruction size between 1 and the actual size, and will be - * reported as having no memory impact. - */ -/* ARGSUSED2 */ -static int -dtrace_dis_isize(uchar_t *instr, dis_isize_t which, model_t model, int *rmindex) -{ - int sz; - dis86_t x; - uint_t mode = SIZE64; - -#if defined(sun) - mode = (model == DATAMODEL_LP64) ? SIZE64 : SIZE32; -#endif - - x.d86_data = (void **)&instr; - x.d86_get_byte = dtrace_dis_get_byte; - x.d86_check_func = NULL; - - if (dtrace_disx86(&x, mode) != 0) - return (-1); - - if (which == DIS_ISIZE_INSTR) - sz = x.d86_len; /* length of the instruction */ - else - sz = x.d86_memsize; /* length of memory operand */ - - if (rmindex != NULL) - *rmindex = x.d86_rmindex; - return (sz); -} - -int -dtrace_instr_size(uchar_t *instr) -{ - return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, DATAMODEL_NATIVE, - NULL)); -} Index: src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S,v retrieving revision 1.6 diff -u -p -r1.6 dtrace_asm.S --- src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S 23 Jun 2016 04:35:35 -0000 1.6 +++ src/external/cddl/osnet/dev/dtrace/arm/dtrace_asm.S 25 Apr 2017 02:37:35 -0000 @@ -19,7 +19,7 @@ * * CDDL HEADER END * - * $FreeBSD$ + * $FreeBSD: head/sys/cddl/dev/dtrace/arm/dtrace_asm.S 308427 2016-11-07 20:02:18Z gonzo $ */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 
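[On the removals above: the amd64-private copies of dis_tables.h and instr_size.c are dropped, presumably in favour of shared x86 copies provided elsewhere in this update. The caller-side contract of dtrace_disx86() is small; the sketch below restates it using only names from the removed header and instr_size.c, with measure_insn() as a hypothetical wrapper:

	static int
	measure_insn(uchar_t *instr)
	{
		dis86_t x;

		x.d86_data = (void **)&instr;		/* cursor the byte fetcher advances */
		x.d86_get_byte = dtrace_dis_get_byte;	/* returns one byte, bumps cursor */
		x.d86_check_func = NULL;

		if (dtrace_disx86(&x, SIZE64) != 0)
			return (-1);			/* undecodable instruction */

		/* d86_len is the instruction length; d86_memsize its memory footprint */
		return (x.d86_len);
	}
]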
@@ -28,7 +28,6 @@ #define _ASM #define _LOCORE -#define LOCORE #include #include @@ -36,6 +35,16 @@ #include #include +#define PSR_I I32_bit +#define PSR_F F32_bit + +#ifdef __ARM_BIG_ENDIAN +#define __BIG_ENDIAN 1 +#endif + +#define EENTRY(x) ENTRY_NP(x) +#define EEND(x) /* nothing */ + /* void dtrace_membar_producer(void) */ @@ -56,7 +65,7 @@ dtrace_icookie_t dtrace_interrupt_disabl ENTRY(dtrace_interrupt_disable) mrs r0, cpsr mov r1, r0 - orr r1, r1, #(I32_bit|F32_bit) + orr r1, r1, #(PSR_I | PSR_F) msr cpsr_c, r1 RET END(dtrace_interrupt_disable) @@ -65,44 +74,15 @@ END(dtrace_interrupt_disable) void dtrace_interrupt_enable(dtrace_icookie_t cookie) */ ENTRY(dtrace_interrupt_enable) - and r0, r0, #(I32_bit|F32_bit) + and r0, r0, #(PSR_I | PSR_F) mrs r1, cpsr - bic r1, r1, #(I32_bit|F32_bit) + bic r1, r1, #(PSR_I | PSR_F) orr r1, r1, r0 msr cpsr_c, r1 RET END(dtrace_interrupt_enable) /* -uint32_t dtrace_cas32(uint32_t *target, uint32_t cmp, uint32_t new) -XXX: just disable interrupts for now, add proper implementation for -ARMv6/ARMv7 later -*/ -ENTRY_NP(dtrace_casptr) -ENTRY(dtrace_cas32) - stmfd sp!, {r4, r5} - - mrs r3, cpsr - mov r4, r3 - orr r4, r4, #(I32_bit|F32_bit) - msr cpsr_c, r4 - - ldr r5, [r0] - cmp r5, r1 - movne r0, r5 - bne 2f - - str r2, [r0] - mov r0, r5 - -2: - msr cpsr_c, r3 - ldmfd sp!, {r4, r5} - RET -END(dtrace_cas32) -END(dtrace_casptr) - -/* uint8_t dtrace_fuword8_nocheck(void *addr) */ @@ -135,20 +115,21 @@ END(dtrace_fuword32_nocheck) /* uint64_t dtrace_fuword64_nocheck(void *addr) -XXX: add byteorder check */ ENTRY(dtrace_fuword64_nocheck) ldm r0, {r2, r3} mov r0, r2 mov r1, r3 -#if 0 -/* little endian */ - mov r0, r2 - mov r1, r3 +#if defined(__BIG_ENDIAN__) /* big endian */ mov r0, r3 mov r1, r2 +#else +/* little endian */ + mov r0, r2 + mov r1, r3 + #endif RET END(dtrace_fuword64_nocheck) @@ -159,21 +140,20 @@ dtrace_copy(uintptr_t uaddr, uintptr_t k */ ENTRY(dtrace_copy) stmfd sp!, {r4-r5} /* stack is 8 byte aligned */ - teq r2, #0x00000000 - mov r5, #0x00000000 - beq 2f - -1: ldrb r4, [r0], #0x0001 - add r5, r5, #0x00000001 - strb r4, [r1], #0x0001 - teqne r5, r2 - bne 1b + teq r2, #0x00000000 + mov r5, #0x00000000 + beq 2f + +1: ldrb r4, [r0], #0x0001 + add r5, r5, #0x00000001 + strb r4, [r1], #0x0001 + teqne r5, r2 + bne 1b -2: ldmfd sp!, {r4-r5} /* stack is 8 byte aligned */ +2: ldmfd sp!, {r4-r5} /* stack is 8 byte aligned */ RET END(dtrace_copy) - /* void dtrace_copystr(uintptr_t uaddr, uintptr_t kaddr, size_t size, @@ -181,48 +161,22 @@ dtrace_copystr(uintptr_t uaddr, uintptr_ XXX: Check for flags? 
*/ ENTRY(dtrace_copystr) - stmfd sp!, {r4-r5} /* stack is 8 byte aligned */ - teq r2, #0x00000000 - mov r5, #0x00000000 - beq 2f - -1: ldrb r4, [r0], #0x0001 - add r5, r5, #0x00000001 - teq r4, #0x00000000 - strb r4, [r1], #0x0001 - teqne r5, r2 - bne 1b + stmfd sp!, {r4-r5} /* stack is 8 byte aligned */ + teq r2, #0x00000000 + mov r5, #0x00000000 + beq 2f + +1: ldrb r4, [r0], #0x0001 + add r5, r5, #0x00000001 + teq r4, #0x00000000 + strb r4, [r1], #0x0001 + teqne r5, r2 + bne 1b -2: ldmfd sp!, {r4-r5} /* stack is 8 byte aligned */ +2: ldmfd sp!, {r4-r5} /* stack is 8 byte aligned */ RET END(dtrace_copystr) - -/* -void dtrace_invop_init(void) -*/ -ENTRY(dtrace_invop_init) - ldr r1, .Ldtrace_invop - ldr r2, .Ldtrace_invop_jump_addr - str r1, [r2] - RET - .align 0 -.Ldtrace_invop: - .word dtrace_invop -.Ldtrace_invop_jump_addr: - .word dtrace_invop_jump_addr -END(dtrace_invop_init) - -/* -void dtrace_invop_uninit(void) -*/ -ENTRY(dtrace_invop_uninit) - mov r0, #0 - ldr r1, .Ldtrace_invop_jump_addr - str r0, [r1] - RET -END(dtrace_invop_uninit) - /* uintptr_t dtrace_caller(int aframes) @@ -231,3 +185,53 @@ ENTRY(dtrace_caller) mov r0, #-1 RET END(dtrace_caller) + +/* +uint32_t +dtrace_cas32(uint32_t *target, uint32_t cmp, uint32_t new) + +void * +dtrace_casptr(volatile void *target, volatile void *cmp, volatile void *new) +*/ +EENTRY(dtrace_casptr) +ENTRY(dtrace_cas32) +#if __ARM_ARCH >= 6 + +1: ldrex r3, [r0] /* Load target */ + cmp r3, r1 /* Check if *target == cmp */ + bne 2f /* No, return */ + strex ip, r2, [r0] /* Store new to target */ + cmp ip, #0 /* Did the store succeed? */ + bne 1b /* No, try again */ +2: mov r0, r3 /* Return the value loaded from target */ + RET + +#else + + /* + * We don't support MP on CPUs older than v6, so just disable interrupts + * and use non-atomic instructions. + */ + + stmfd sp!, {r4, r5} + + mrs r3, cpsr + mov r4, r3 + orr r4, r4, #(PSR_I | PSR_F) + msr cpsr_c, r4 + + ldr r5, [r0] + cmp r5, r1 + movne r0, r5 + bne 2f + + str r2, [r0] + mov r0, r5 + +2: + msr cpsr_c, r3 + ldmfd sp!, {r4, r5} + RET +#endif +END(dtrace_cas32) +EEND(dtrace_casptr) Index: src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c,v retrieving revision 1.5 diff -u -p -r1.5 dtrace_isa.c --- src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c 2 Oct 2015 22:15:18 -0000 1.5 +++ src/external/cddl/osnet/dev/dtrace/arm/dtrace_isa.c 5 Jul 2017 20:28:13 -0000 @@ -19,7 +19,7 @@ * * CDDL HEADER END * - * $FreeBSD$ + * $FreeBSD: head/sys/cddl/dev/dtrace/arm/dtrace_isa.c 295882 2016-02-22 09:08:04Z skra $ */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. @@ -100,8 +100,6 @@ dtrace_getpcstack(pc_t *pcstack, int pcs * that generated the stack frame. We hope for the best. 
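[On the dtrace_cas32()/dtrace_casptr() rewrite above: the ARMv6+ path is a conventional load-exclusive/store-exclusive retry loop, and the pre-v6 path falls back to disabling interrupts. Behaviourally, and ignoring the atomicity that is the whole point of keeping it in assembly, it computes:

	/* Behavioural sketch only -- not atomic as written. */
	uint32_t
	cas32_behaviour(volatile uint32_t *target, uint32_t cmp, uint32_t newval)
	{
		uint32_t old = *target;		/* ldrex */

		if (old == cmp)
			*target = newval;	/* strex, retried if another CPU intervened */

		return (old);			/* callers compare the result against cmp */
	}
]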
*/ scp = frame[FR_SCP]; - printf("--> %08x\n", (uint32_t)scp); - if (aframes > 0) { aframes--; if ((aframes == 0) && (caller != 0)) { @@ -109,7 +107,6 @@ dtrace_getpcstack(pc_t *pcstack, int pcs } } else { - printf("++ --> %08x\n", (uint32_t)scp); pcstack[depth++] = scp; } @@ -154,13 +151,13 @@ dtrace_getpcstack(pc_t *pcstack, int pcs void dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit) { - printf("unimplemented\n"); + printf("IMPLEMENT ME: %s\n", __func__); } int dtrace_getustackdepth(void) { - printf("unimplemented\n"); + printf("IMPLEMENT ME: %s\n", __func__); return (0); } @@ -174,8 +171,9 @@ dtrace_getufpstack(uint64_t *pcstack, ui uint64_t dtrace_getarg(int arg, int aframes) { - printf("unimplemented\n"); +/* struct arm_frame *fp = (struct arm_frame *)dtrace_getfp();*/ + printf("IMPLEMENT ME: %s\n", __func__); return (0); } @@ -227,6 +225,7 @@ dtrace_getstackdepth(int aframes) ulong_t dtrace_getreg(struct trapframe *rp, uint_t reg) { + printf("IMPLEMENT ME: %s\n", __func__); return (0); } Index: src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c,v retrieving revision 1.3 diff -u -p -r1.3 dtrace_subr.c --- src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c 27 Feb 2017 06:47:00 -0000 1.3 +++ src/external/cddl/osnet/dev/dtrace/arm/dtrace_subr.c 10 May 2017 10:10:04 -0000 @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * $FreeBSD$ + * $FreeBSD: head/sys/cddl/dev/dtrace/arm/dtrace_subr.c 308457 2016-11-08 23:59:41Z bdrewery $ * */ /* @@ -47,12 +47,26 @@ #include #include +#define FAULT_ALIGN FAULT_ALIGN_0 extern uintptr_t kernelbase; extern uintptr_t dtrace_in_probe_addr; extern int dtrace_in_probe; + +void dtrace_gethrtime_init(void *arg); + +#define DELAYBRANCH(x) ((int)(x) < 0) + +#define BIT_PC 15 +#define BIT_LR 14 +#define BIT_SP 13 + extern dtrace_id_t dtrace_probeid_error; +extern int (*dtrace_invop_jump_addr)(struct trapframe *); +extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); +void dtrace_invop_init(void); +void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); @@ -61,8 +75,6 @@ typedef struct dtrace_invop_hdlr { dtrace_invop_hdlr_t *dtrace_invop_hdlr; -void dtrace_gethrtime_init(void *arg); - int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { @@ -76,6 +88,7 @@ dtrace_invop(uintptr_t addr, struct trap return (0); } + void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { @@ -114,6 +127,7 @@ dtrace_invop_remove(int (*func)(uintptr_ kmem_free(hdlr, sizeof (dtrace_invop_hdlr_t)); } +/*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { @@ -156,7 +170,7 @@ dtrace_sync_func(void) void dtrace_sync(void) { - dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); + dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* @@ -167,36 +181,35 @@ dtrace_sync(void) * Returns nanoseconds since boot. 
*/ uint64_t -dtrace_gethrtime() +dtrace_gethrtime(void) { - struct timespec curtime; + struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); - } uint64_t dtrace_gethrestime(void) { - struct timespec curtime; + struct timespec current_time; - getnanotime(&curtime); + dtrace_getnanotime(¤t_time); - return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); + return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. Not used on ARM yet */ int dtrace_trap(struct trapframe *frame, u_int type) { - cpuid_t cpuid = cpu_number(); /* current cpu id */ + cpuid_t curcpu_id = cpu_number(); /* current cpu id */ /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets - * a flag in it's per-cpu flags to indicate that it doesn't + * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * @@ -204,24 +217,23 @@ dtrace_trap(struct trapframe *frame, u_i * */ - if ((cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { + if ((cpu_core[curcpu_id].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { /* Page fault. */ - case 0: + case FAULT_ALIGN: /* Flag a bad address. */ - cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; - cpu_core[cpuid].cpuc_dtrace_illval = 0; + cpu_core[curcpu_id].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; + cpu_core[curcpu_id].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ - panic("%s", __func__); - // frame->pc += sizeof(int); + frame->tf_pc += sizeof(int); return (1); default: /* Handle all other traps in the usual way. */ @@ -248,3 +260,349 @@ dtrace_gethrtime_init(void *arg) { /* FIXME */ } + +static uint32_t +dtrace_expand_imm(uint32_t imm12) +{ + uint32_t unrot = imm12 & 0xff; + int amount = 2 * (imm12 >> 8); + + if (amount) + return (unrot >> amount) | (unrot << (32 - amount)); + else + return unrot; +} + +static uint32_t +dtrace_add_with_carry(uint32_t x, uint32_t y, int carry_in, + int *carry_out, int *overflow) +{ + uint32_t result; + uint64_t unsigned_sum = x + y + (uint32_t)carry_in; + int64_t signed_sum = (int32_t)x + (int32_t)y + (int32_t)carry_in; + KASSERT(carry_in == 1); + + result = (uint32_t)(unsigned_sum & 0xffffffff); + *carry_out = ((uint64_t)result == unsigned_sum) ? 1 : 0; + *overflow = ((int64_t)result == signed_sum) ? 
0 : 1; + + return result; +} + +static void +dtrace_invop_emulate(int invop, struct trapframe *frame) +{ + uint32_t op = invop; +#if 1 + /* nbsd encoding */ + uint32_t code = op >> 28; + uint32_t data = op; +#else + /* fbsd encoding */ + uint32_t code = op & DTRACE_INVOP_MASK; + uint32_t data = DTRACE_INVOP_DATA(invop); +#endif + + switch (code) { + case DTRACE_INVOP_MOV_IP_SP: + /* mov ip, sp */ + frame->tf_ip = frame->tf_svc_sp; + frame->tf_pc += 4; + break; + case DTRACE_INVOP_BX_LR: + /* bx lr */ + frame->tf_pc = frame->tf_svc_lr; + break; + case DTRACE_INVOP_MOV_PC_LR: + /* mov pc, lr */ + frame->tf_pc = frame->tf_svc_lr; + break; + case DTRACE_INVOP_LDM: + /* ldm sp, {..., pc} */ + /* FALLTHRU */ + case DTRACE_INVOP_POPM: { + /* ldmib sp, {..., pc} */ + uint32_t register_list = (op & 0xffff); + uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp; + uint32_t *regs = &frame->tf_r0; + int i; + + /* POPM */ + if (code == DTRACE_INVOP_POPM) + sp++; + + for (i = 0; i <= 12; i++) { + if (register_list & (1 << i)) + regs[i] = *sp++; + } + if (register_list & (1 << 13)) + frame->tf_svc_sp = *sp++; + if (register_list & (1 << 14)) + frame->tf_svc_lr = *sp++; + frame->tf_pc = *sp; + break; + } + case DTRACE_INVOP_LDR_IMM: { + /* ldr r?, [{pc,r?}, #?] */ + uint32_t rt = (op >> 12) & 0xf; + uint32_t rn = (op >> 16) & 0xf; + uint32_t imm = op & 0xfff; + uint32_t *regs = &frame->tf_r0; + KDASSERT(rt <= 12); + KDASSERT(rn == 15 || rn <= 12); + if (rn == 15) + regs[rt] = *((uint32_t *)(intptr_t)(frame->tf_pc + 8 + imm)); + else + regs[rt] = *((uint32_t *)(intptr_t)(regs[rn] + imm)); + frame->tf_pc += 4; + break; + } + case DTRACE_INVOP_MOVW: { + /* movw r?, #? */ + uint32_t rd = (op >> 12) & 0xf; + uint32_t imm = (op & 0xfff) | ((op & 0xf0000) >> 4); + uint32_t *regs = &frame->tf_r0; + KDASSERT(rd <= 12); + regs[rd] = imm; + frame->tf_pc += 4; + break; + } + case DTRACE_INVOP_MOV_IMM: { + /* mov r?, #? */ + uint32_t rd = (op >> 12) & 0xf; + uint32_t imm = dtrace_expand_imm(op & 0xfff); + uint32_t *regs = &frame->tf_r0; + KDASSERT(rd <= 12); + regs[rd] = imm; + frame->tf_pc += 4; + break; + } + case DTRACE_INVOP_CMP_IMM: { + /* cmp r?, #? */ + uint32_t rn = (op >> 16) & 0xf; + uint32_t *regs = &frame->tf_r0; + uint32_t imm = dtrace_expand_imm(op & 0xfff); + uint32_t spsr = frame->tf_spsr; + uint32_t result; + int carry; + int overflow; + /* + * (result, carry, overflow) = AddWithCarry(R[n], NOT(imm32), ’1’); + * APSR.N = result<31>; + * APSR.Z = IsZeroBit(result); + * APSR.C = carry; + * APSR.V = overflow; + */ + KDASSERT(rn <= 12); + result = dtrace_add_with_carry(regs[rn], ~imm, 1, &carry, &overflow); + if (result & 0x80000000) + spsr |= PSR_N_bit; + else + spsr &= ~PSR_N_bit; + if (result == 0) + spsr |= PSR_Z_bit; + else + spsr &= ~PSR_Z_bit; + if (carry) + spsr |= PSR_C_bit; + else + spsr &= ~PSR_C_bit; + if (overflow) + spsr |= PSR_V_bit; + else + spsr &= ~PSR_V_bit; + +#if 0 + aprint_normal("pc=%x Rn=%x imm=%x %c%c%c%c\n", frame->tf_pc, regs[rn], imm, + (spsr & PSR_N_bit) ? 'N' : 'n', + (spsr & PSR_Z_bit) ? 'Z' : 'z', + (spsr & PSR_C_bit) ? 'C' : 'c', + (spsr & PSR_V_bit) ? 'V' : 'v'); +#endif + frame->tf_spsr = spsr; + frame->tf_pc += 4; + break; + } + case DTRACE_INVOP_B: { + /* b ??? 
*/ + uint32_t imm = (op & 0x00ffffff) << 2; + int32_t diff; + /* SignExtend(imm26, 32) */ + if (imm & 0x02000000) + imm |= 0xfc000000; + diff = (int32_t)imm; + frame->tf_pc += 8 + diff; + break; + } + case DTRACE_INVOP_PUSHM: { + /* push {...} */ + uint32_t register_list = (op & 0xffff); + uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp; + uint32_t *regs = &frame->tf_r0; + int i; + int count = 0; + +#if 0 + if ((op & 0x0fff0fff) == 0x052d0004) { + /* A2: str r4, [sp, #-4]! */ + *(sp - 1) = regs[4]; + frame->tf_pc += 4; + break; + } +#endif + + for (i = 0; i < 16; i++) { + if (register_list & (1 << i)) + count++; + } + sp -= count; + + for (i = 0; i <= 12; i++) { + if (register_list & (1 << i)) + *sp++ = regs[i]; + } + if (register_list & (1 << 13)) + *sp++ = frame->tf_svc_sp; + if (register_list & (1 << 14)) + *sp++ = frame->tf_svc_lr; + if (register_list & (1 << 15)) + *sp = frame->tf_pc + 8; + + /* make sure the caches and memory are in sync */ + cpu_dcache_wbinv_range(frame->tf_svc_sp, count * 4); + + /* In case the current page tables have been modified ... */ + cpu_tlb_flushID(); + cpu_cpwait(); + + frame->tf_svc_sp -= count * 4; + frame->tf_pc += 4; + + break; + } + default: + KDASSERTMSG(0, "invop 0x%08x code %u tf %p", invop, code, frame); + } +} + +static int +dtrace_invop_start(struct trapframe *frame) +{ +#if 0 + register_t *r0, *sp; + int data, invop, reg, update_sp; +#endif + int invop; + + invop = dtrace_invop(frame->tf_pc, frame, frame->tf_r0); + + dtrace_invop_emulate(invop, frame); + +#if 0 + switch (invop & DTRACE_INVOP_MASK) { + case DTRACE_INVOP_PUSHM: + sp = (register_t *)frame->tf_svc_sp; + r0 = &frame->tf_r0; + data = DTRACE_INVOP_DATA(invop); + + /* + * Store the pc, lr, and sp. These have their own + * entries in the struct. + */ + if (data & (1 << BIT_PC)) { + sp--; + *sp = frame->tf_pc; + } + if (data & (1 << BIT_LR)) { + sp--; + *sp = frame->tf_svc_lr; + } + if (data & (1 << BIT_SP)) { + sp--; + *sp = frame->tf_svc_sp; + } + + /* Store the general registers */ + for (reg = 12; reg >= 0; reg--) { + if (data & (1 << reg)) { + sp--; + *sp = r0[reg]; + } + } + + /* Update the stack pointer and program counter to continue */ + frame->tf_svc_sp = (register_t)sp; + frame->tf_pc += 4; + break; + case DTRACE_INVOP_POPM: + sp = (register_t *)frame->tf_svc_sp; + r0 = &frame->tf_r0; + data = DTRACE_INVOP_DATA(invop); + + /* Read the general registers */ + for (reg = 0; reg <= 12; reg++) { + if (data & (1 << reg)) { + r0[reg] = *sp; + sp++; + } + } + + /* + * Set the stack pointer. If we don't update it here we will + * need to update it at the end as the instruction would do + */ + update_sp = 1; + if (data & (1 << BIT_SP)) { + frame->tf_svc_sp = *sp; + *sp++; + update_sp = 0; + } + + /* Update the link register, we need to use the correct copy */ + if (data & (1 << BIT_LR)) { + frame->tf_svc_lr = *sp; + *sp++; + } + /* + * And the program counter. If it's not in the list skip over + * it when we return so to not hit this again. 
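For reference, a minimal standalone sketch of the ARM "modified immediate" decoding that dtrace_expand_imm() above performs (the low 8 bits of imm12 rotated right by twice bits <11:8>); the file name and the two sample encodings are illustrative only, and the program can be built and run in user space to check the expansion:

	/*
	 * demo_expand_imm.c -- user-space illustration of the modified
	 * immediate expansion used by the invop emulator above.
	 */
	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t
	expand_imm(uint32_t imm12)
	{
		uint32_t unrot = imm12 & 0xff;
		int amount = 2 * (imm12 >> 8);

		if (amount == 0)
			return unrot;
		/* Rotate the 8-bit value right by "amount" bits. */
		return (unrot >> amount) | (unrot << (32 - amount));
	}

	int
	main(void)
	{
		/* "mov r0, #0xff000000" encodes imm12 = 0x4ff: ROR(0xff, 8). */
		printf("0x%08" PRIx32 "\n", expand_imm(0x4ff));	/* 0xff000000 */
		/* "cmp r0, #1" encodes imm12 = 0x001: no rotation. */
		printf("0x%08" PRIx32 "\n", expand_imm(0x001));	/* 0x00000001 */
		return 0;
	}
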
+ */ + if (data & (1 << BIT_PC)) { + frame->tf_pc = *sp; + *sp++; + } else + frame->tf_pc += 4; + + /* Update the stack pointer if we haven't already done so */ + if (update_sp) + frame->tf_svc_sp = (register_t)sp; + break; + case DTRACE_INVOP_B: + data = DTRACE_INVOP_DATA(invop) & 0x00ffffff; + /* Sign extend the data */ + if ((data & (1 << 23)) != 0) + data |= 0xff000000; + /* The data is the number of 4-byte words to change the pc */ + data *= 4; + data += 8; + frame->tf_pc += data; + break; + + default: + return (-1); + break; + } +#endif + + return (0); +} + +void dtrace_invop_init(void) +{ + dtrace_invop_jump_addr = dtrace_invop_start; +} + +void dtrace_invop_uninit(void) +{ + dtrace_invop_jump_addr = 0; +} Index: src/external/cddl/osnet/dev/dtrace/arm/regset.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/arm/regset.h,v retrieving revision 1.1 diff -u -p -r1.1 regset.h --- src/external/cddl/osnet/dev/dtrace/arm/regset.h 21 Jun 2013 19:16:00 -0000 1.1 +++ src/external/cddl/osnet/dev/dtrace/arm/regset.h 12 Apr 2017 09:46:41 -0000 @@ -19,7 +19,7 @@ * * CDDL HEADER END * - * $FreeBSD$ + * $FreeBSD: head/sys/cddl/dev/dtrace/arm/regset.h 278529 2015-02-10 19:41:30Z gnn $ */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. @@ -42,18 +42,13 @@ extern "C" { #endif -/* - * XXX: define registers properly - */ - #if 0 -#define REG_PC PC -#define REG_FP EBP -#define REG_SP SP -#define REG_PS EFL -#define REG_R0 EAX -#define REG_R1 EDX -#endif +#define REG_LINK R14 +#define REG_SP R12 +#define REG_PS R0 +#define REG_R0 R0 +#define REG_R1 R1 +#endif #ifdef __cplusplus } Index: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c diff -N src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c --- src/external/cddl/osnet/dev/dtrace/i386/dis_tables.c 20 Jul 2011 19:51:57 -0000 1.3 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,3195 +0,0 @@ -/* $NetBSD: dis_tables.c,v 1.3 2011/07/20 19:51:57 tron Exp $ */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dis_tables.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - - -#if defined(sun) -#pragma ident "@(#)dis_tables.c 1.11 06/03/02 SMI" -#endif - -#include "dis_tables.h" - -/* BEGIN CSTYLED */ - -/* - * Disassembly begins in dis_distable, which is equivalent to the One-byte - * Opcode Map in the Intel IA32 ISA Reference (page A-6 in my copy). The - * decoding loops then traverse out through the other tables as necessary to - * decode a given instruction. - * - * The behavior of this file can be controlled by one of the following flags: - * - * DIS_TEXT Include text for disassembly - * DIS_MEM Include memory-size calculations - * - * Either or both of these can be defined. - * - * This file is not, and will never be, cstyled. If anything, the tables should - * be taken out another tab stop or two so nothing overlaps. - */ - -/* - * These functions must be provided for the consumer to do disassembly. - */ -#ifdef DIS_TEXT -extern char *strncpy(char *, const char *, size_t); -extern size_t strlen(const char *); -extern int strcmp(const char *, const char *); -extern int strncmp(const char *, const char *, size_t); -extern size_t strlcat(char *, const char *, size_t); -#endif - - -#define TERM NULL /* used to indicate that the 'indirect' */ - /* field terminates - no pointer. */ - -/* Used to decode instructions. */ -typedef struct instable { - const struct instable *it_indirect; /* for decode op codes */ - uchar_t it_adrmode; -#ifdef DIS_TEXT - char it_name[NCPS]; - uint_t it_suffix:1; /* mneu + "w", "l", or "d" */ -#endif -#ifdef DIS_MEM - uint_t it_size:16; -#endif - uint_t it_invalid64:1; /* opcode invalid in amd64 */ - uint_t it_always64:1; /* 64 bit when in 64 bit mode */ - uint_t it_invalid32:1; /* invalid in IA32 */ - uint_t it_stackop:1; /* push/pop stack operation */ -} instable_t; - -/* - * Instruction formats. - */ -enum { - UNKNOWN, - MRw, - IMlw, - IMw, - IR, - OA, - AO, - MS, - SM, - Mv, - Mw, - M, /* register or memory */ - Mb, /* register or memory, always byte sized */ - MO, /* memory only (no registers) */ - PREF, - SWAPGS, - R, - RA, - SEG, - MR, - RM, - IA, - MA, - SD, - AD, - SA, - D, - INM, - SO, - BD, - I, - P, - V, - DSHIFT, /* for double shift that has an 8-bit immediate */ - U, - OVERRIDE, - NORM, /* instructions w/o ModR/M byte, no memory access */ - IMPLMEM, /* instructions w/o ModR/M byte, implicit mem access */ - O, /* for call */ - JTAB, /* jump table */ - IMUL, /* for 186 iimul instr */ - CBW, /* so data16 can be evaluated for cbw and variants */ - MvI, /* for 186 logicals */ - ENTER, /* for 186 enter instr */ - RMw, /* for 286 arpl instr */ - Ib, /* for push immediate byte */ - F, /* for 287 instructions */ - FF, /* for 287 instructions */ - FFC, /* for 287 instructions */ - DM, /* 16-bit data */ - AM, /* 16-bit addr */ - LSEG, /* for 3-bit seg reg encoding */ - MIb, /* for 386 logicals */ - SREG, /* for 386 special registers */ - PREFIX, /* a REP instruction prefix */ - LOCK, /* a LOCK instruction prefix */ - INT3, /* The int 3 instruction, which has a fake operand */ - INTx, /* The normal int instruction, with explicit int num */ - DSHIFTcl, /* for double shift that implicitly uses %cl */ - CWD, /* so data16 can be evaluated for cwd and variants */ - RET, /* single immediate 16-bit operand */ - MOVZ, /* for movs and movz, with different size operands */ - XADDB, /* for xaddb */ - MOVSXZ, /* AMD64 mov sign extend 32 to 64 bit instruction */ - -/* - * MMX/SIMD addressing modes. 
- */ - - MMO, /* Prefixable MMX/SIMD-Int mm/mem -> mm */ - MMOIMPL, /* Prefixable MMX/SIMD-Int mm -> mm (mem) */ - MMO3P, /* Prefixable MMX/SIMD-Int mm -> r32,imm8 */ - MMOM3, /* Prefixable MMX/SIMD-Int mm -> r32 */ - MMOS, /* Prefixable MMX/SIMD-Int mm -> mm/mem */ - MMOMS, /* Prefixable MMX/SIMD-Int mm -> mem */ - MMOPM, /* MMX/SIMD-Int mm/mem -> mm,imm8 */ - MMOPRM, /* Prefixable MMX/SIMD-Int r32/mem -> mm,imm8 */ - MMOSH, /* Prefixable MMX mm,imm8 */ - MM, /* MMX/SIMD-Int mm/mem -> mm */ - MMS, /* MMX/SIMD-Int mm -> mm/mem */ - MMSH, /* MMX mm,imm8 */ - XMMO, /* Prefixable SIMD xmm/mem -> xmm */ - XMMOS, /* Prefixable SIMD xmm -> xmm/mem */ - XMMOPM, /* Prefixable SIMD xmm/mem w/to xmm,imm8 */ - XMMOMX, /* Prefixable SIMD mm/mem -> xmm */ - XMMOX3, /* Prefixable SIMD xmm -> r32 */ - XMMOXMM, /* Prefixable SIMD xmm/mem -> mm */ - XMMOM, /* Prefixable SIMD xmm -> mem */ - XMMOMS, /* Prefixable SIMD mem -> xmm */ - XMM, /* SIMD xmm/mem -> xmm */ - XMMXIMPL, /* SIMD xmm -> xmm (mem) */ - XMM3P, /* SIMD xmm -> r32,imm8 */ - XMMP, /* SIMD xmm/mem w/to xmm,imm8 */ - XMMPRM, /* SIMD r32/mem -> xmm,imm8 */ - XMMS, /* SIMD xmm -> xmm/mem */ - XMMM, /* SIMD mem -> xmm */ - XMMMS, /* SIMD xmm -> mem */ - XMM3MX, /* SIMD r32/mem -> xmm */ - XMM3MXS, /* SIMD xmm -> r32/mem */ - XMMSH, /* SIMD xmm,imm8 */ - XMMXM3, /* SIMD xmm/mem -> r32 */ - XMMX3, /* SIMD xmm -> r32 */ - XMMXMM, /* SIMD xmm/mem -> mm */ - XMMMX, /* SIMD mm -> xmm */ - XMMXM, /* SIMD xmm -> mm */ - XMMFENCE, /* SIMD lfence or mfence */ - XMMSFNC /* SIMD sfence (none or mem) */ -}; - -#define FILL 0x90 /* Fill byte used for alignment (nop) */ - -/* -** Register numbers for the i386 -*/ -#define EAX_REGNO 0 -#define ECX_REGNO 1 -#define EDX_REGNO 2 -#define EBX_REGNO 3 -#define ESP_REGNO 4 -#define EBP_REGNO 5 -#define ESI_REGNO 6 -#define EDI_REGNO 7 - -/* - * modes for immediate values - */ -#define MODE_NONE 0 -#define MODE_IPREL 1 /* signed IP relative value */ -#define MODE_SIGNED 2 /* sign extended immediate */ -#define MODE_IMPLIED 3 /* constant value implied from opcode */ -#define MODE_OFFSET 4 /* offset part of an address */ - -/* - * The letters used in these macros are: - * IND - indirect to another to another table - * "T" - means to Terminate indirections (this is the final opcode) - * "S" - means "operand length suffix required" - * "NS" - means "no suffix" which is the operand length suffix of the opcode - * "Z" - means instruction size arg required - * "u" - means the opcode is invalid in IA32 but valid in amd64 - * "x" - means the opcode is invalid in amd64, but not IA32 - * "y" - means the operand size is always 64 bits in 64 bit mode - * "p" - means push/pop stack operation - */ - -#if defined(DIS_TEXT) && defined(DIS_MEM) -#define IND(table) {table, 0, "", 0, 0, 0, 0, 0, 0} -#define INDx(table) {table, 0, "", 0, 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, name, 0, 0, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, name, 0, 0, 0, 0, 1, 0} -#define TNSx(name, amode) {TERM, amode, name, 0, 0, 1, 0, 0, 0} -#define TNSy(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0, 1} -#define TNSZ(name, amode, sz) {TERM, amode, name, 0, sz, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, name, 0, sz, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, name, 1, 0, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, name, 1, 0, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, name, 1, 0, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, 
name, 1, 0, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, name, 1, sz, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, name, 1, sz, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, name, 1, sz, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, "", 0, 0, 0, 0, 0} -#elif defined(DIS_TEXT) -#define IND(table) {table, 0, "", 0, 0, 0, 0, 0} -#define INDx(table) {table, 0, "", 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, name, 0, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0} -#define TNSx(name, amode) {TERM, amode, name, 0, 1, 0, 0, 0} -#define TNSy(name, amode) {TERM, amode, name, 0, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, name, 0, 0, 1, 0, 1} -#define TNSZ(name, amode, sz) {TERM, amode, name, 0, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, name, 0, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, name, 1, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, name, 1, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, name, 1, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, name, 1, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, name, 1, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, name, 1, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, name, 1, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, "", 0, 0, 0, 0, 0} -#elif defined(DIS_MEM) -#define IND(table) {table, 0, 0, 0, 0, 0, 0} -#define INDx(table) {table, 0, 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, 0, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, 0, 0, 0, 1, 0} -#define TNSy(name, amode) {TERM, amode, 0, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, 0, 0, 1, 0, 1} -#define TNSx(name, amode) {TERM, amode, 0, 1, 0, 0, 0} -#define TNSZ(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, sz, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, 0, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, 0, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, 0, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, 0, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, sz, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, sz, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, 0, 0, 0, 0, 0} -#else -#define IND(table) {table[0], 0, 0, 0, 0, 0} -#define INDx(table) {table[0], 0, 1, 0, 0, 0} -#define TNS(name, amode) {TERM, amode, 0, 0, 0, 0} -#define TNSu(name, amode) {TERM, amode, 0, 0, 1, 0} -#define TNSy(name, amode) {TERM, amode, 0, 1, 0, 0} -#define TNSyp(name, amode) {TERM, amode, 0, 1, 0, 1} -#define TNSx(name, amode) {TERM, amode, 1, 0, 0, 0} -#define TNSZ(name, amode, sz) {TERM, amode, 0, 0, 0, 0} -#define TNSZy(name, amode, sz) {TERM, amode, 0, 1, 0, 0} -#define TS(name, amode) {TERM, amode, 0, 0, 0, 0} -#define TSx(name, amode) {TERM, amode, 1, 0, 0, 0} -#define TSy(name, amode) {TERM, amode, 0, 1, 0, 0} -#define TSp(name, amode) {TERM, amode, 0, 0, 0, 1} -#define TSZ(name, amode, sz) {TERM, amode, 0, 0, 0, 0} -#define TSZx(name, amode, sz) {TERM, amode, 1, 0, 0, 0} -#define TSZy(name, amode, sz) {TERM, amode, 0, 1, 0, 0} -#define INVALID {TERM, UNKNOWN, 0, 0, 0, 0} -#endif - -#ifdef DIS_TEXT -/* - * this decodes the r_m field for mode's 0, 1, 2 in 16 bit mode - */ -const char *const dis_addr16[3][8] = { -"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "", - "(%bx)", -"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di", "(%bp)", - "(%bx)", -"(%bx,%si)", 
"(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "(%bp)", - "(%bx)", -}; - - -/* - * This decodes 32 bit addressing mode r_m field for modes 0, 1, 2 - */ -const char *const dis_addr32_mode0[16] = { - "(%eax)", "(%ecx)", "(%edx)", "(%ebx)", "", "", "(%esi)", "(%edi)", - "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "", "(%r14d)", "(%r15d)" -}; - -const char *const dis_addr32_mode12[16] = { - "(%eax)", "(%ecx)", "(%edx)", "(%ebx)", "", "(%ebp)", "(%esi)", "(%edi)", - "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "(%r13d)", "(%r14d)", "(%r15d)" -}; - -/* - * This decodes 64 bit addressing mode r_m field for modes 0, 1, 2 - */ -const char *const dis_addr64_mode0[16] = { - "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "", "(%rip)", "(%rsi)", "(%rdi)", - "(%r8)", "(%r9)", "(%r10)", "(%r11)", "(%r12)", "(%rip)", "(%r14)", "(%r15)" -}; -const char *const dis_addr64_mode12[16] = { - "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "", "(%rbp)", "(%rsi)", "(%rdi)", - "(%r8)", "(%r9)", "(%r10)", "(%r11)", "(%r12)", "(%r13)", "(%r14)", "(%r15)" -}; - -/* - * decode for scale from SIB byte - */ -const char *const dis_scale_factor[4] = { ")", ",2)", ",4)", ",8)" }; - -/* - * register decoding for normal references to registers (ie. not addressing) - */ -const char *const dis_REG8[16] = { - "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh", - "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" -}; - -const char *const dis_REG8_REX[16] = { - "%al", "%cl", "%dl", "%bl", "%spl", "%bpl", "%sil", "%dil", - "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" -}; - -const char *const dis_REG16[16] = { - "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di", - "%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w" -}; - -const char *const dis_REG32[16] = { - "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", - "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" -}; - -const char *const dis_REG64[16] = { - "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", - "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" -}; - -const char *const dis_DEBUGREG[16] = { - "%db0", "%db1", "%db2", "%db3", "%db4", "%db5", "%db6", "%db7", - "%db8", "%db9", "%db10", "%db11", "%db12", "%db13", "%db14", "%db15" -}; - -const char *const dis_CONTROLREG[16] = { - "%cr0", "%cr1", "%cr2", "%cr3", "%cr4", "%cr5?", "%cr6?", "%cr7?", - "%cr8", "%cr9?", "%cr10?", "%cr11?", "%cr12?", "%cr13?", "%cr14?", "%cr15?" 
-}; - -const char *const dis_TESTREG[16] = { - "%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7", - "%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7" -}; - -const char *const dis_MMREG[16] = { - "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", - "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" -}; - -const char *const dis_XMMREG[16] = { - "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" -}; - -const char *const dis_SEGREG[16] = { - "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "", - "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "" -}; - -/* - * SIMD predicate suffixes - */ -const char *const dis_PREDSUFFIX[8] = { - "eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord" -}; - - - -#endif /* DIS_TEXT */ - - - - -/* - * "decode table" for 64 bit mode MOVSXD instruction (opcode 0x63) - */ -const instable_t dis_opMOVSLD = TNS("movslq",MOVSXZ); - -/* - * "decode table" for pause and clflush instructions - */ -const instable_t dis_opPause = TNS("pause", NORM); - -/* - * Decode table for 0x0F00 opcodes - */ -const instable_t dis_op0F00[8] = { - -/* [0] */ TNS("sldt",M), TNS("str",M), TNSy("lldt",M), TNSy("ltr",M), -/* [4] */ TNSZ("verr",M,2), TNSZ("verw",M,2), INVALID, INVALID, -}; - - -/* - * Decode table for 0x0F01 opcodes - */ -const instable_t dis_op0F01[8] = { - -/* [0] */ TNSZ("sgdt",MO,6), TNSZ("sidt",MO,6), TNSZ("lgdt",MO,6), TNSZ("lidt",MO,6), -/* [4] */ TNSZ("smsw",M,2), INVALID, TNSZ("lmsw",M,2), TNS("invlpg",SWAPGS), -}; - -/* - * Decode table for 0x0F18 opcodes -- SIMD prefetch - */ -const instable_t dis_op0F18[8] = { - -/* [0] */ TNS("prefetchnta",PREF),TNS("prefetcht0",PREF), TNS("prefetcht1",PREF), TNS("prefetcht2",PREF), -/* [4] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Decode table for 0x0FAE opcodes -- SIMD state save/restore - */ -const instable_t dis_op0FAE[8] = { -/* [0] */ TNSZ("fxsave",M,512), TNSZ("fxrstor",M,512), TNS("ldmxcsr",M), TNS("stmxcsr",M), -/* [4] */ INVALID, TNS("lfence",XMMFENCE), TNS("mfence",XMMFENCE), TNS("sfence",XMMSFNC), -}; - -/* - * Decode table for 0x0FBA opcodes - */ - -const instable_t dis_op0FBA[8] = { - -/* [0] */ INVALID, INVALID, INVALID, INVALID, -/* [4] */ TS("bt",MIb), TS("bts",MIb), TS("btr",MIb), TS("btc",MIb), -}; - -/* - * Decode table for 0x0FC7 opcode - */ - -const instable_t dis_op0FC7[8] = { - -/* [0] */ INVALID, TNS("cmpxchg8b",M), INVALID, INVALID, -/* [4] */ INVALID, INVALID, INVALID, INVALID, -}; - - -/* - * Decode table for 0x0FC8 opcode -- 486 bswap instruction - * - *bit pattern: 0000 1111 1100 1reg - */ -const instable_t dis_op0FC8[4] = { -/* [0] */ TNS("bswap",R), INVALID, INVALID, INVALID, -}; - -/* - * Decode table for 0x0F71, 0x0F72, and 0x0F73 opcodes -- MMX instructions - */ -const instable_t dis_op0F7123[4][8] = { -{ -/* [70].0 */ INVALID, INVALID, INVALID, INVALID, -/* .4 */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [71].0 */ INVALID, INVALID, TNS("psrlw",MMOSH), INVALID, -/* .4 */ TNS("psraw",MMOSH), INVALID, TNS("psllw",MMOSH), INVALID, -}, { -/* [72].0 */ INVALID, INVALID, TNS("psrld",MMOSH), INVALID, -/* .4 */ TNS("psrad",MMOSH), INVALID, TNS("pslld",MMOSH), INVALID, -}, { -/* [73].0 */ INVALID, INVALID, TNS("psrlq",MMOSH), TNS("INVALID",MMOSH), -/* .4 */ INVALID, INVALID, TNS("psllq",MMOSH), TNS("INVALID",MMOSH), -} }; - -/* - * Decode table for SIMD extensions to above 0x0F71-0x0F73 opcodes. 
- */ -const instable_t dis_opSIMD7123[32] = { -/* [70].0 */ INVALID, INVALID, INVALID, INVALID, -/* .4 */ INVALID, INVALID, INVALID, INVALID, - -/* [71].0 */ INVALID, INVALID, TNS("psrlw",XMMSH), INVALID, -/* .4 */ TNS("psraw",XMMSH), INVALID, TNS("psllw",XMMSH), INVALID, - -/* [72].0 */ INVALID, INVALID, TNS("psrld",XMMSH), INVALID, -/* .4 */ TNS("psrad",XMMSH), INVALID, TNS("pslld",XMMSH), INVALID, - -/* [73].0 */ INVALID, INVALID, TNS("psrlq",XMMSH), TNS("psrldq",XMMSH), -/* .4 */ INVALID, INVALID, TNS("psllq",XMMSH), TNS("pslldq",XMMSH), -}; - -/* - * SIMD instructions have been wedged into the existing IA32 instruction - * set through the use of prefixes. That is, while 0xf0 0x58 may be - * addps, 0xf3 0xf0 0x58 (literally, repz addps) is a completely different - * instruction - addss. At present, three prefixes have been coopted in - * this manner - address size (0x66), repnz (0xf2) and repz (0xf3). The - * following tables are used to provide the prefixed instruction names. - * The arrays are sparse, but they're fast. - */ - -/* - * Decode table for SIMD instructions with the address size (0x66) prefix. - */ -const instable_t dis_opSIMDdata16[256] = { -/* [00] */ INVALID, INVALID, INVALID, INVALID, -/* [04] */ INVALID, INVALID, INVALID, INVALID, -/* [08] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [10] */ TNSZ("movupd",XMM,16), TNSZ("movupd",XMMS,16), TNSZ("movlpd",XMMM,8), TNSZ("movlpd",XMMMS,8), -/* [14] */ TNSZ("unpcklpd",XMM,16),TNSZ("unpckhpd",XMM,16),TNSZ("movhpd",XMMM,8), TNSZ("movhpd",XMMMS,8), -/* [18] */ INVALID, INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, - -/* [20] */ INVALID, INVALID, INVALID, INVALID, -/* [24] */ INVALID, INVALID, INVALID, INVALID, -/* [28] */ TNSZ("movapd",XMM,16), TNSZ("movapd",XMMS,16), TNSZ("cvtpi2pd",XMMOMX,8),TNSZ("movntpd",XMMOMS,16), -/* [2C] */ TNSZ("cvttpd2pi",XMMXMM,16),TNSZ("cvtpd2pi",XMMXMM,16),TNSZ("ucomisd",XMM,8),TNSZ("comisd",XMM,8), - -/* [30] */ INVALID, INVALID, INVALID, INVALID, -/* [34] */ INVALID, INVALID, INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, - -/* [40] */ INVALID, INVALID, INVALID, INVALID, -/* [44] */ INVALID, INVALID, INVALID, INVALID, -/* [48] */ INVALID, INVALID, INVALID, INVALID, -/* [4C] */ INVALID, INVALID, INVALID, INVALID, - -/* [50] */ TNS("movmskpd",XMMOX3), TNSZ("sqrtpd",XMM,16), INVALID, INVALID, -/* [54] */ TNSZ("andpd",XMM,16), TNSZ("andnpd",XMM,16), TNSZ("orpd",XMM,16), TNSZ("xorpd",XMM,16), -/* [58] */ TNSZ("addpd",XMM,16), TNSZ("mulpd",XMM,16), TNSZ("cvtpd2ps",XMM,16),TNSZ("cvtps2dq",XMM,16), -/* [5C] */ TNSZ("subpd",XMM,16), TNSZ("minpd",XMM,16), TNSZ("divpd",XMM,16), TNSZ("maxpd",XMM,16), - -/* [60] */ TNSZ("punpcklbw",XMM,16),TNSZ("punpcklwd",XMM,16),TNSZ("punpckldq",XMM,16),TNSZ("packsswb",XMM,16), -/* [64] */ TNSZ("pcmpgtb",XMM,16), TNSZ("pcmpgtw",XMM,16), TNSZ("pcmpgtd",XMM,16), TNSZ("packuswb",XMM,16), -/* [68] */ TNSZ("punpckhbw",XMM,16),TNSZ("punpckhwd",XMM,16),TNSZ("punpckhdq",XMM,16),TNSZ("packssdw",XMM,16), -/* [6C] */ TNSZ("punpcklqdq",XMM,16),TNSZ("punpckhqdq",XMM,16),TNSZ("movd",XMM3MX,4),TNSZ("movdqa",XMM,16), - -/* [70] */ TNSZ("pshufd",XMMP,16), INVALID, INVALID, INVALID, -/* [74] */ TNSZ("pcmpeqb",XMM,16), TNSZ("pcmpeqw",XMM,16), TNSZ("pcmpeqd",XMM,16), INVALID, -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, TNSZ("movd",XMM3MXS,4), TNSZ("movdqa",XMMS,16), - -/* [80] */ INVALID, INVALID, INVALID, INVALID, -/* 
[84] */ INVALID, INVALID, INVALID, INVALID, -/* [88] */ INVALID, INVALID, INVALID, INVALID, -/* [8C] */ INVALID, INVALID, INVALID, INVALID, - -/* [90] */ INVALID, INVALID, INVALID, INVALID, -/* [94] */ INVALID, INVALID, INVALID, INVALID, -/* [98] */ INVALID, INVALID, INVALID, INVALID, -/* [9C] */ INVALID, INVALID, INVALID, INVALID, - -/* [A0] */ INVALID, INVALID, INVALID, INVALID, -/* [A4] */ INVALID, INVALID, INVALID, INVALID, -/* [A8] */ INVALID, INVALID, INVALID, INVALID, -/* [AC] */ INVALID, INVALID, INVALID, INVALID, - -/* [B0] */ INVALID, INVALID, INVALID, INVALID, -/* [B4] */ INVALID, INVALID, INVALID, INVALID, -/* [B8] */ INVALID, INVALID, INVALID, INVALID, -/* [BC] */ INVALID, INVALID, INVALID, INVALID, - -/* [C0] */ INVALID, INVALID, TNSZ("cmppd",XMMP,16), INVALID, -/* [C4] */ TNSZ("pinsrw",XMMPRM,2),TNS("pextrw",XMM3P), TNSZ("shufpd",XMMP,16), INVALID, -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, - -/* [D0] */ INVALID, TNSZ("psrlw",XMM,16), TNSZ("psrld",XMM,16), TNSZ("psrlq",XMM,16), -/* [D4] */ TNSZ("paddq",XMM,16), TNSZ("pmullw",XMM,16), TNSZ("movq",XMMS,8), TNS("pmovmskb",XMMX3), -/* [D8] */ TNSZ("psubusb",XMM,16), TNSZ("psubusw",XMM,16), TNSZ("pminub",XMM,16), TNSZ("pand",XMM,16), -/* [DC] */ TNSZ("paddusb",XMM,16), TNSZ("paddusw",XMM,16), TNSZ("pmaxub",XMM,16), TNSZ("pandn",XMM,16), - -/* [E0] */ TNSZ("pavgb",XMM,16), TNSZ("psraw",XMM,16), TNSZ("psrad",XMM,16), TNSZ("pavgw",XMM,16), -/* [E4] */ TNSZ("pmulhuw",XMM,16), TNSZ("pmulhw",XMM,16), TNSZ("cvttpd2dq",XMM,16),TNSZ("movntdq",XMMS,16), -/* [E8] */ TNSZ("psubsb",XMM,16), TNSZ("psubsw",XMM,16), TNSZ("pminsw",XMM,16), TNSZ("por",XMM,16), -/* [EC] */ TNSZ("paddsb",XMM,16), TNSZ("paddsw",XMM,16), TNSZ("pmaxsw",XMM,16), TNSZ("pxor",XMM,16), - -/* [F0] */ INVALID, TNSZ("psllw",XMM,16), TNSZ("pslld",XMM,16), TNSZ("psllq",XMM,16), -/* [F4] */ TNSZ("pmuludq",XMM,16), TNSZ("pmaddwd",XMM,16), TNSZ("psadbw",XMM,16), TNSZ("maskmovdqu", XMMXIMPL,16), -/* [F8] */ TNSZ("psubb",XMM,16), TNSZ("psubw",XMM,16), TNSZ("psubd",XMM,16), TNSZ("psubq",XMM,16), -/* [FC] */ TNSZ("paddb",XMM,16), TNSZ("paddw",XMM,16), TNSZ("paddd",XMM,16), INVALID, -}; - -/* - * Decode table for SIMD instructions with the repnz (0xf2) prefix. 
- */ -const instable_t dis_opSIMDrepnz[256] = { -/* [00] */ INVALID, INVALID, INVALID, INVALID, -/* [04] */ INVALID, INVALID, INVALID, INVALID, -/* [08] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [10] */ TNSZ("movsd",XMM,8), TNSZ("movsd",XMMS,8), INVALID, INVALID, -/* [14] */ INVALID, INVALID, INVALID, INVALID, -/* [18] */ INVALID, INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, - -/* [20] */ INVALID, INVALID, INVALID, INVALID, -/* [24] */ INVALID, INVALID, INVALID, INVALID, -/* [28] */ INVALID, INVALID, TNSZ("cvtsi2sd",XMM3MX,4),INVALID, -/* [2C] */ TNSZ("cvttsd2si",XMMXM3,8),TNSZ("cvtsd2si",XMMXM3,8),INVALID, INVALID, - -/* [30] */ INVALID, INVALID, INVALID, INVALID, -/* [34] */ INVALID, INVALID, INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, - -/* [40] */ INVALID, INVALID, INVALID, INVALID, -/* [44] */ INVALID, INVALID, INVALID, INVALID, -/* [48] */ INVALID, INVALID, INVALID, INVALID, -/* [4C] */ INVALID, INVALID, INVALID, INVALID, - -/* [50] */ INVALID, TNSZ("sqrtsd",XMM,8), INVALID, INVALID, -/* [54] */ INVALID, INVALID, INVALID, INVALID, -/* [58] */ TNSZ("addsd",XMM,8), TNSZ("mulsd",XMM,8), TNSZ("cvtsd2ss",XMM,8), INVALID, -/* [5C] */ TNSZ("subsd",XMM,8), TNSZ("minsd",XMM,8), TNSZ("divsd",XMM,8), TNSZ("maxsd",XMM,8), - -/* [60] */ INVALID, INVALID, INVALID, INVALID, -/* [64] */ INVALID, INVALID, INVALID, INVALID, -/* [68] */ INVALID, INVALID, INVALID, INVALID, -/* [6C] */ INVALID, INVALID, INVALID, INVALID, - -/* [70] */ TNSZ("pshuflw",XMMP,16),INVALID, INVALID, INVALID, -/* [74] */ INVALID, INVALID, INVALID, INVALID, -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, INVALID, INVALID, - -/* [80] */ INVALID, INVALID, INVALID, INVALID, -/* [84] */ INVALID, INVALID, INVALID, INVALID, -/* [88] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [90] */ INVALID, INVALID, INVALID, INVALID, -/* [94] */ INVALID, INVALID, INVALID, INVALID, -/* [98] */ INVALID, INVALID, INVALID, INVALID, -/* [9C] */ INVALID, INVALID, INVALID, INVALID, - -/* [A0] */ INVALID, INVALID, INVALID, INVALID, -/* [A4] */ INVALID, INVALID, INVALID, INVALID, -/* [A8] */ INVALID, INVALID, INVALID, INVALID, -/* [AC] */ INVALID, INVALID, INVALID, INVALID, - -/* [B0] */ INVALID, INVALID, INVALID, INVALID, -/* [B4] */ INVALID, INVALID, INVALID, INVALID, -/* [B8] */ INVALID, INVALID, INVALID, INVALID, -/* [BC] */ INVALID, INVALID, INVALID, INVALID, - -/* [C0] */ INVALID, INVALID, TNSZ("cmpsd",XMMP,8), INVALID, -/* [C4] */ INVALID, INVALID, INVALID, INVALID, -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, - -/* [D0] */ INVALID, INVALID, INVALID, INVALID, -/* [D4] */ INVALID, INVALID, TNS("movdq2q",XMMXM), INVALID, -/* [D8] */ INVALID, INVALID, INVALID, INVALID, -/* [DC] */ INVALID, INVALID, INVALID, INVALID, - -/* [E0] */ INVALID, INVALID, INVALID, INVALID, -/* [E4] */ INVALID, INVALID, TNSZ("cvtpd2dq",XMM,16),INVALID, -/* [E8] */ INVALID, INVALID, INVALID, INVALID, -/* [EC] */ INVALID, INVALID, INVALID, INVALID, - -/* [F0] */ INVALID, INVALID, INVALID, INVALID, -/* [F4] */ INVALID, INVALID, INVALID, INVALID, -/* [F8] */ INVALID, INVALID, INVALID, INVALID, -/* [FC] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Decode table for SIMD instructions with the repz (0xf3) prefix. 
- */ -const instable_t dis_opSIMDrepz[256] = { -/* [00] */ INVALID, INVALID, INVALID, INVALID, -/* [04] */ INVALID, INVALID, INVALID, INVALID, -/* [08] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [10] */ TNSZ("movss",XMM,4), TNSZ("movss",XMMS,4), INVALID, INVALID, -/* [14] */ INVALID, INVALID, INVALID, INVALID, -/* [18] */ INVALID, INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, - -/* [20] */ INVALID, INVALID, INVALID, INVALID, -/* [24] */ INVALID, INVALID, INVALID, INVALID, -/* [28] */ INVALID, INVALID, TNSZ("cvtsi2ss",XMM3MX,4),INVALID, -/* [2C] */ TNSZ("cvttss2si",XMMXM3,4),TNSZ("cvtss2si",XMMXM3,4),INVALID, INVALID, - -/* [30] */ INVALID, INVALID, INVALID, INVALID, -/* [34] */ INVALID, INVALID, INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, - -/* [40] */ INVALID, INVALID, INVALID, INVALID, -/* [44] */ INVALID, INVALID, INVALID, INVALID, -/* [48] */ INVALID, INVALID, INVALID, INVALID, -/* [4C] */ INVALID, INVALID, INVALID, INVALID, - -/* [50] */ INVALID, TNSZ("sqrtss",XMM,4), TNSZ("rsqrtss",XMM,4), TNSZ("rcpss",XMM,4), -/* [54] */ INVALID, INVALID, INVALID, INVALID, -/* [58] */ TNSZ("addss",XMM,4), TNSZ("mulss",XMM,4), TNSZ("cvtss2sd",XMM,4), TNSZ("cvttps2dq",XMM,16), -/* [5C] */ TNSZ("subss",XMM,4), TNSZ("minss",XMM,4), TNSZ("divss",XMM,4), TNSZ("maxss",XMM,4), - -/* [60] */ INVALID, INVALID, INVALID, INVALID, -/* [64] */ INVALID, INVALID, INVALID, INVALID, -/* [68] */ INVALID, INVALID, INVALID, INVALID, -/* [6C] */ INVALID, INVALID, INVALID, TNSZ("movdqu",XMM,16), - -/* [70] */ TNSZ("pshufhw",XMMP,16),INVALID, INVALID, INVALID, -/* [74] */ INVALID, INVALID, INVALID, INVALID, -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, TNSZ("movq",XMM,8), TNSZ("movdqu",XMMS,16), - -/* [80] */ INVALID, INVALID, INVALID, INVALID, -/* [84] */ INVALID, INVALID, INVALID, INVALID, -/* [88] */ INVALID, INVALID, INVALID, INVALID, -/* [0C] */ INVALID, INVALID, INVALID, INVALID, - -/* [90] */ INVALID, INVALID, INVALID, INVALID, -/* [94] */ INVALID, INVALID, INVALID, INVALID, -/* [98] */ INVALID, INVALID, INVALID, INVALID, -/* [9C] */ INVALID, INVALID, INVALID, INVALID, - -/* [A0] */ INVALID, INVALID, INVALID, INVALID, -/* [A4] */ INVALID, INVALID, INVALID, INVALID, -/* [A8] */ INVALID, INVALID, INVALID, INVALID, -/* [AC] */ INVALID, INVALID, INVALID, INVALID, - -/* [B0] */ INVALID, INVALID, INVALID, INVALID, -/* [B4] */ INVALID, INVALID, INVALID, INVALID, -/* [B8] */ INVALID, INVALID, INVALID, INVALID, -/* [BC] */ INVALID, INVALID, INVALID, INVALID, - -/* [C0] */ INVALID, INVALID, TNSZ("cmpss",XMMP,4), INVALID, -/* [C4] */ INVALID, INVALID, INVALID, INVALID, -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, - -/* [D0] */ INVALID, INVALID, INVALID, INVALID, -/* [D4] */ INVALID, INVALID, TNS("movq2dq",XMMMX), INVALID, -/* [D8] */ INVALID, INVALID, INVALID, INVALID, -/* [DC] */ INVALID, INVALID, INVALID, INVALID, - -/* [E0] */ INVALID, INVALID, INVALID, INVALID, -/* [E4] */ INVALID, INVALID, TNSZ("cvtdq2pd",XMM,8), INVALID, -/* [E8] */ INVALID, INVALID, INVALID, INVALID, -/* [EC] */ INVALID, INVALID, INVALID, INVALID, - -/* [F0] */ INVALID, INVALID, INVALID, INVALID, -/* [F4] */ INVALID, INVALID, INVALID, INVALID, -/* [F8] */ INVALID, INVALID, INVALID, INVALID, -/* [FC] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Decode table for 0x0F opcodes - */ - -const instable_t dis_op0F[16][16] = { 
-{ -/* [00] */ IND(dis_op0F00), IND(dis_op0F01), TNS("lar",MR), TNS("lsl",MR), -/* [04] */ INVALID, TNS("syscall",NORM), TNS("clts",NORM), TNS("sysret",NORM), -/* [08] */ TNS("invd",NORM), TNS("wbinvd",NORM), INVALID, TNS("ud2",NORM), -/* [0C] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [10] */ TNSZ("movups",XMMO,16), TNSZ("movups",XMMOS,16),TNSZ("movlps",XMMO,8), TNSZ("movlps",XMMOS,8), -/* [14] */ TNSZ("unpcklps",XMMO,16),TNSZ("unpckhps",XMMO,16),TNSZ("movhps",XMMOM,8),TNSZ("movhps",XMMOMS,8), -/* [18] */ IND(dis_op0F18), INVALID, INVALID, INVALID, -/* [1C] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [20] */ TSy("mov",SREG), TSy("mov",SREG), TSy("mov",SREG), TSy("mov",SREG), -/* [24] */ TSx("mov",SREG), INVALID, TSx("mov",SREG), INVALID, -/* [28] */ TNSZ("movaps",XMMO,16), TNSZ("movaps",XMMOS,16),TNSZ("cvtpi2ps",XMMOMX,8),TNSZ("movntps",XMMOS,16), -/* [2C] */ TNSZ("cvttps2pi",XMMOXMM,8),TNSZ("cvtps2pi",XMMOXMM,8),TNSZ("ucomiss",XMMO,4),TNSZ("comiss",XMMO,4), -}, { -/* [30] */ TNS("wrmsr",NORM), TNS("rdtsc",NORM), TNS("rdmsr",NORM), TNS("rdpmc",NORM), -/* [34] */ TNSx("sysenter",NORM), TNSx("sysexit",NORM), INVALID, INVALID, -/* [38] */ INVALID, INVALID, INVALID, INVALID, -/* [3C] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [40] */ TS("cmovx.o",MR), TS("cmovx.no",MR), TS("cmovx.b",MR), TS("cmovx.ae",MR), -/* [44] */ TS("cmovx.e",MR), TS("cmovx.ne",MR), TS("cmovx.be",MR), TS("cmovx.a",MR), -/* [48] */ TS("cmovx.s",MR), TS("cmovx.ns",MR), TS("cmovx.pe",MR), TS("cmovx.po",MR), -/* [4C] */ TS("cmovx.l",MR), TS("cmovx.ge",MR), TS("cmovx.le",MR), TS("cmovx.g",MR), -}, { -/* [50] */ TNS("movmskps",XMMOX3), TNSZ("sqrtps",XMMO,16), TNSZ("rsqrtps",XMMO,16),TNSZ("rcpps",XMMO,16), -/* [54] */ TNSZ("andps",XMMO,16), TNSZ("andnps",XMMO,16), TNSZ("orps",XMMO,16), TNSZ("xorps",XMMO,16), -/* [58] */ TNSZ("addps",XMMO,16), TNSZ("mulps",XMMO,16), TNSZ("cvtps2pd",XMMO,8),TNSZ("cvtdq2ps",XMMO,16), -/* [5C] */ TNSZ("subps",XMMO,16), TNSZ("minps",XMMO,16), TNSZ("divps",XMMO,16), TNSZ("maxps",XMMO,16), -}, { -/* [60] */ TNSZ("punpcklbw",MMO,4),TNSZ("punpcklwd",MMO,4),TNSZ("punpckldq",MMO,4),TNSZ("packsswb",MMO,8), -/* [64] */ TNSZ("pcmpgtb",MMO,8), TNSZ("pcmpgtw",MMO,8), TNSZ("pcmpgtd",MMO,8), TNSZ("packuswb",MMO,8), -/* [68] */ TNSZ("punpckhbw",MMO,8),TNSZ("punpckhwd",MMO,8),TNSZ("punpckhdq",MMO,8),TNSZ("packssdw",MMO,8), -/* [6C] */ TNSZ("INVALID",MMO,0), TNSZ("INVALID",MMO,0), TNSZ("movd",MMO,4), TNSZ("movq",MMO,8), -}, { -/* [70] */ TNSZ("pshufw",MMOPM,8), TNS("psrXXX",MR), TNS("psrXXX",MR), TNS("psrXXX",MR), -/* [74] */ TNSZ("pcmpeqb",MMO,8), TNSZ("pcmpeqw",MMO,8), TNSZ("pcmpeqd",MMO,8), TNS("emms",NORM), -/* [78] */ INVALID, INVALID, INVALID, INVALID, -/* [7C] */ INVALID, INVALID, TNSZ("movd",MMOS,4), TNSZ("movq",MMOS,8), -}, { -/* [80] */ TNS("jo",D), TNS("jno",D), TNS("jb",D), TNS("jae",D), -/* [84] */ TNS("je",D), TNS("jne",D), TNS("jbe",D), TNS("ja",D), -/* [88] */ TNS("js",D), TNS("jns",D), TNS("jp",D), TNS("jnp",D), -/* [8C] */ TNS("jl",D), TNS("jge",D), TNS("jle",D), TNS("jg",D), -}, { -/* [90] */ TNS("seto",Mb), TNS("setno",Mb), TNS("setb",Mb), TNS("setae",Mb), -/* [94] */ TNS("sete",Mb), TNS("setne",Mb), TNS("setbe",Mb), TNS("seta",Mb), -/* [98] */ TNS("sets",Mb), TNS("setns",Mb), TNS("setp",Mb), TNS("setnp",Mb), -/* [9C] */ TNS("setl",Mb), TNS("setge",Mb), TNS("setle",Mb), TNS("setg",Mb), -}, { -/* [A0] */ TSp("push",LSEG), TSp("pop",LSEG), TNS("cpuid",NORM), TS("bt",RMw), -/* [A4] */ TS("shld",DSHIFT), TS("shld",DSHIFTcl), INVALID, INVALID, -/* [A8] */ TSp("push",LSEG), 
TSp("pop",LSEG), TNS("rsm",NORM), TS("bts",RMw), -/* [AC] */ TS("shrd",DSHIFT), TS("shrd",DSHIFTcl), IND(dis_op0FAE), TS("imul",MRw), -}, { -/* [B0] */ TNS("cmpxchgb",RMw), TS("cmpxchg",RMw), TS("lss",MR), TS("btr",RMw), -/* [B4] */ TS("lfs",MR), TS("lgs",MR), TS("movzb",MOVZ), TNS("movzwl",MOVZ), -/* [B8] */ INVALID, INVALID, IND(dis_op0FBA), TS("btc",RMw), -/* [BC] */ TS("bsf",MRw), TS("bsr",MRw), TS("movsb",MOVZ), TNS("movswl",MOVZ), -}, { -/* [C0] */ TNS("xaddb",XADDB), TS("xadd",RMw), TNSZ("cmpps",XMMOPM,16),TNS("movnti",RM), -/* [C4] */ TNSZ("pinsrw",MMOPRM,2),TNS("pextrw",MMO3P), TNSZ("shufps",XMMOPM,16),IND(dis_op0FC7), -/* [C8] */ INVALID, INVALID, INVALID, INVALID, -/* [CC] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [D0] */ INVALID, TNSZ("psrlw",MMO,8), TNSZ("psrld",MMO,8), TNSZ("psrlq",MMO,8), -/* [D4] */ TNSZ("paddq",MMO,8), TNSZ("pmullw",MMO,8), TNSZ("INVALID",MMO,0), TNS("pmovmskb",MMOM3), -/* [D8] */ TNSZ("psubusb",MMO,8), TNSZ("psubusw",MMO,8), TNSZ("pminub",MMO,8), TNSZ("pand",MMO,8), -/* [DC] */ TNSZ("paddusb",MMO,8), TNSZ("paddusw",MMO,8), TNSZ("pmaxub",MMO,8), TNSZ("pandn",MMO,8), -}, { -/* [E0] */ TNSZ("pavgb",MMO,8), TNSZ("psraw",MMO,8), TNSZ("psrad",MMO,8), TNSZ("pavgw",MMO,8), -/* [E4] */ TNSZ("pmulhuw",MMO,8), TNSZ("pmulhw",MMO,8), TNS("INVALID",XMMO), TNSZ("movntq",MMOMS,8), -/* [E8] */ TNSZ("psubsb",MMO,8), TNSZ("psubsw",MMO,8), TNSZ("pminsw",MMO,8), TNSZ("por",MMO,8), -/* [EC] */ TNSZ("paddsb",MMO,8), TNSZ("paddsw",MMO,8), TNSZ("pmaxsw",MMO,8), TNSZ("pxor",MMO,8), -}, { -/* [F0] */ INVALID, TNSZ("psllw",MMO,8), TNSZ("pslld",MMO,8), TNSZ("psllq",MMO,8), -/* [F4] */ TNSZ("pmuludq",MMO,8), TNSZ("pmaddwd",MMO,8), TNSZ("psadbw",MMO,8), TNSZ("maskmovq",MMOIMPL,8), -/* [F8] */ TNSZ("psubb",MMO,8), TNSZ("psubw",MMO,8), TNSZ("psubd",MMO,8), TNSZ("psubq",MMO,8), -/* [FC] */ TNSZ("paddb",MMO,8), TNSZ("paddw",MMO,8), TNSZ("paddd",MMO,8), INVALID, -} }; - - -/* - * Decode table for 0x80 opcodes - */ - -const instable_t dis_op80[8] = { - -/* [0] */ TNS("addb",IMlw), TNS("orb",IMw), TNS("adcb",IMlw), TNS("sbbb",IMlw), -/* [4] */ TNS("andb",IMw), TNS("subb",IMlw), TNS("xorb",IMw), TNS("cmpb",IMlw), -}; - - -/* - * Decode table for 0x81 opcodes. - */ - -const instable_t dis_op81[8] = { - -/* [0] */ TS("add",IMlw), TS("or",IMw), TS("adc",IMlw), TS("sbb",IMlw), -/* [4] */ TS("and",IMw), TS("sub",IMlw), TS("xor",IMw), TS("cmp",IMlw), -}; - - -/* - * Decode table for 0x82 opcodes. - */ - -const instable_t dis_op82[8] = { - -/* [0] */ TNSx("addb",IMlw), TNSx("orb",IMlw), TNSx("adcb",IMlw), TNSx("sbbb",IMlw), -/* [4] */ TNSx("andb",IMlw), TNSx("subb",IMlw), TNSx("xorb",IMlw), TNSx("cmpb",IMlw), -}; -/* - * Decode table for 0x83 opcodes. - */ - -const instable_t dis_op83[8] = { - -/* [0] */ TS("add",IMlw), TS("or",IMlw), TS("adc",IMlw), TS("sbb",IMlw), -/* [4] */ TS("and",IMlw), TS("sub",IMlw), TS("xor",IMlw), TS("cmp",IMlw), -}; - -/* - * Decode table for 0xC0 opcodes. - */ - -const instable_t dis_opC0[8] = { - -/* [0] */ TNS("rolb",MvI), TNS("rorb",MvI), TNS("rclb",MvI), TNS("rcrb",MvI), -/* [4] */ TNS("shlb",MvI), TNS("shrb",MvI), INVALID, TNS("sarb",MvI), -}; - -/* - * Decode table for 0xD0 opcodes. - */ - -const instable_t dis_opD0[8] = { - -/* [0] */ TNS("rolb",Mv), TNS("rorb",Mv), TNS("rclb",Mv), TNS("rcrb",Mv), -/* [4] */ TNS("shlb",Mv), TNS("shrb",Mv), TNS("salb",Mv), TNS("sarb",Mv), -}; - -/* - * Decode table for 0xC1 opcodes. 
- * 186 instruction set - */ - -const instable_t dis_opC1[8] = { - -/* [0] */ TS("rol",MvI), TS("ror",MvI), TS("rcl",MvI), TS("rcr",MvI), -/* [4] */ TS("shl",MvI), TS("shr",MvI), TS("sal",MvI), TS("sar",MvI), -}; - -/* - * Decode table for 0xD1 opcodes. - */ - -const instable_t dis_opD1[8] = { - -/* [0] */ TS("rol",Mv), TS("ror",Mv), TS("rcl",Mv), TS("rcr",Mv), -/* [4] */ TS("shl",Mv), TS("shr",Mv), TS("sal",Mv), TS("sar",Mv), -}; - - -/* - * Decode table for 0xD2 opcodes. - */ - -const instable_t dis_opD2[8] = { - -/* [0] */ TNS("rolb",Mv), TNS("rorb",Mv), TNS("rclb",Mv), TNS("rcrb",Mv), -/* [4] */ TNS("shlb",Mv), TNS("shrb",Mv), TNS("salb",Mv), TNS("sarb",Mv), -}; -/* - * Decode table for 0xD3 opcodes. - */ - -const instable_t dis_opD3[8] = { - -/* [0] */ TS("rol",Mv), TS("ror",Mv), TS("rcl",Mv), TS("rcr",Mv), -/* [4] */ TS("shl",Mv), TS("shr",Mv), TS("salb",Mv), TS("sar",Mv), -}; - - -/* - * Decode table for 0xF6 opcodes. - */ - -const instable_t dis_opF6[8] = { - -/* [0] */ TNS("testb",IMw), TNS("testb",IMw), TNS("notb",Mw), TNS("negb",Mw), -/* [4] */ TNS("mulb",MA), TNS("imulb",MA), TNS("divb",MA), TNS("idivb",MA), -}; - - -/* - * Decode table for 0xF7 opcodes. - */ - -const instable_t dis_opF7[8] = { - -/* [0] */ TS("test",IMw), TS("test",IMw), TS("not",Mw), TS("neg",Mw), -/* [4] */ TS("mul",MA), TS("imul",MA), TS("div",MA), TS("idiv",MA), -}; - - -/* - * Decode table for 0xFE opcodes. - */ - -const instable_t dis_opFE[8] = { - -/* [0] */ TNS("incb",Mw), TNS("decb",Mw), INVALID, INVALID, -/* [4] */ INVALID, INVALID, INVALID, INVALID, -}; -/* - * Decode table for 0xFF opcodes. - */ - -const instable_t dis_opFF[8] = { - -/* [0] */ TS("inc",Mw), TS("dec",Mw), TNSyp("call",INM), TNS("lcall",INM), -/* [4] */ TNSy("jmp",INM), TNS("ljmp",INM), TSp("push",M), INVALID, -}; - -/* for 287 instructions, which are a mess to decode */ - -const instable_t dis_opFP1n2[8][8] = { -{ -/* bit pattern: 1101 1xxx MODxx xR/M */ -/* [0,0] */ TNS("fadds",M), TNS("fmuls",M), TNS("fcoms",M), TNS("fcomps",M), -/* [0,4] */ TNS("fsubs",M), TNS("fsubrs",M), TNS("fdivs",M), TNS("fdivrs",M), -}, { -/* [1,0] */ TNS("flds",M), INVALID, TNS("fsts",M), TNS("fstps",M), -/* [1,4] */ TNSZ("fldenv",M,28), TNSZ("fldcw",M,2), TNSZ("fnstenv",M,28), TNSZ("fnstcw",M,2), -}, { -/* [2,0] */ TNS("fiaddl",M), TNS("fimull",M), TNS("ficoml",M), TNS("ficompl",M), -/* [2,4] */ TNS("fisubl",M), TNS("fisubrl",M), TNS("fidivl",M), TNS("fidivrl",M), -}, { -/* [3,0] */ TNS("fildl",M), INVALID, TNS("fistl",M), TNS("fistpl",M), -/* [3,4] */ INVALID, TNSZ("fldt",M,10), INVALID, TNSZ("fstpt",M,10), -}, { -/* [4,0] */ TNSZ("faddl",M,8), TNSZ("fmull",M,8), TNSZ("fcoml",M,8), TNSZ("fcompl",M,8), -/* [4,1] */ TNSZ("fsubl",M,8), TNSZ("fsubrl",M,8), TNSZ("fdivl",M,8), TNSZ("fdivrl",M,8), -}, { -/* [5,0] */ TNSZ("fldl",M,8), INVALID, TNSZ("fstl",M,8), TNSZ("fstpl",M,8), -/* [5,4] */ TNSZ("frstor",M,108), INVALID, TNSZ("fnsave",M,108), TNSZ("fnstsw",M,2), -}, { -/* [6,0] */ TNSZ("fiadd",M,2), TNSZ("fimul",M,2), TNSZ("ficom",M,2), TNSZ("ficomp",M,2), -/* [6,4] */ TNSZ("fisub",M,2), TNSZ("fisubr",M,2), TNSZ("fidiv",M,2), TNSZ("fidivr",M,2), -}, { -/* [7,0] */ TNSZ("fild",M,2), INVALID, TNSZ("fist",M,2), TNSZ("fistp",M,2), -/* [7,4] */ TNSZ("fbld",M,10), TNSZ("fildll",M,8), TNSZ("fbstp",M,10), TNSZ("fistpll",M,8), -} }; - -const instable_t dis_opFP3[8][8] = { -{ -/* bit pattern: 1101 1xxx 11xx xREG */ -/* [0,0] */ TNS("fadd",FF), TNS("fmul",FF), TNS("fcom",F), TNS("fcomp",F), -/* [0,4] */ TNS("fsub",FF), TNS("fsubr",FF), TNS("fdiv",FF), TNS("fdivr",FF), 
-}, { -/* [1,0] */ TNS("fld",F), TNS("fxch",F), TNS("fnop",NORM), TNS("fstp",F), -/* [1,4] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [2,0] */ INVALID, INVALID, INVALID, INVALID, -/* [2,4] */ INVALID, TNS("fucompp",NORM), INVALID, INVALID, -}, { -/* [3,0] */ INVALID, INVALID, INVALID, INVALID, -/* [3,4] */ INVALID, INVALID, INVALID, INVALID, -}, { -/* [4,0] */ TNS("fadd",FF), TNS("fmul",FF), TNS("fcom",F), TNS("fcomp",F), -/* [4,4] */ TNS("fsub",FF), TNS("fsubr",FF), TNS("fdiv",FF), TNS("fdivr",FF), -}, { -/* [5,0] */ TNS("ffree",F), TNS("fxch",F), TNS("fst",F), TNS("fstp",F), -/* [5,4] */ TNS("fucom",F), TNS("fucomp",F), INVALID, INVALID, -}, { -/* [6,0] */ TNS("faddp",FF), TNS("fmulp",FF), TNS("fcomp",F), TNS("fcompp",NORM), -/* [6,4] */ TNS("fsubp",FF), TNS("fsubrp",FF), TNS("fdivp",FF), TNS("fdivrp",FF), -}, { -/* [7,0] */ TNS("ffree",F), TNS("fxch",F), TNS("fstp",F), TNS("fstp",F), -/* [7,4] */ TNS("fnstsw",M), TNS("fucomip",FFC), TNS("fcomip",FFC), INVALID, -} }; - -const instable_t dis_opFP4[4][8] = { -{ -/* bit pattern: 1101 1001 111x xxxx */ -/* [0,0] */ TNS("fchs",NORM), TNS("fabs",NORM), INVALID, INVALID, -/* [0,4] */ TNS("ftst",NORM), TNS("fxam",NORM), TNS("ftstp",NORM), INVALID, -}, { -/* [1,0] */ TNS("fld1",NORM), TNS("fldl2t",NORM), TNS("fldl2e",NORM), TNS("fldpi",NORM), -/* [1,4] */ TNS("fldlg2",NORM), TNS("fldln2",NORM), TNS("fldz",NORM), INVALID, -}, { -/* [2,0] */ TNS("f2xm1",NORM), TNS("fyl2x",NORM), TNS("fptan",NORM), TNS("fpatan",NORM), -/* [2,4] */ TNS("fxtract",NORM), TNS("fprem1",NORM), TNS("fdecstp",NORM), TNS("fincstp",NORM), -}, { -/* [3,0] */ TNS("fprem",NORM), TNS("fyl2xp1",NORM), TNS("fsqrt",NORM), TNS("fsincos",NORM), -/* [3,4] */ TNS("frndint",NORM), TNS("fscale",NORM), TNS("fsin",NORM), TNS("fcos",NORM), -} }; - -const instable_t dis_opFP5[8] = { -/* bit pattern: 1101 1011 111x xxxx */ -/* [0] */ TNS("feni",NORM), TNS("fdisi",NORM), TNS("fnclex",NORM), TNS("fninit",NORM), -/* [4] */ TNS("fsetpm",NORM), TNS("frstpm",NORM), INVALID, INVALID, -}; - -const instable_t dis_opFP6[8] = { -/* bit pattern: 1101 1011 11yy yxxx */ -/* [00] */ TNS("fcmov.nb",FF), TNS("fcmov.ne",FF), TNS("fcmov.nbe",FF), TNS("fcmov.nu",FF), -/* [04] */ INVALID, TNS("fucomi",F), TNS("fcomi",F), INVALID, -}; - -const instable_t dis_opFP7[8] = { -/* bit pattern: 1101 1010 11yy yxxx */ -/* [00] */ TNS("fcmov.b",FF), TNS("fcmov.e",FF), TNS("fcmov.be",FF), TNS("fcmov.u",FF), -/* [04] */ INVALID, INVALID, INVALID, INVALID, -}; - -/* - * Main decode table for the op codes. The first two nibbles - * will be used as an index into the table. If there is a - * a need to further decode an instruction, the array to be - * referenced is indicated with the other two entries being - * empty. 
- */ - -const instable_t dis_distable[16][16] = { -{ -/* [0,0] */ TNS("addb",RMw), TS("add",RMw), TNS("addb",MRw), TS("add",MRw), -/* [0,4] */ TNS("addb",IA), TS("add",IA), TSx("push",SEG), TSx("pop",SEG), -/* [0,8] */ TNS("orb",RMw), TS("or",RMw), TNS("orb",MRw), TS("or",MRw), -/* [0,C] */ TNS("orb",IA), TS("or",IA), TSx("push",SEG), IND(&dis_op0F[0][0]), -}, { -/* [1,0] */ TNS("adcb",RMw), TS("adc",RMw), TNS("adcb",MRw), TS("adc",MRw), -/* [1,4] */ TNS("adcb",IA), TS("adc",IA), TSx("push",SEG), TSx("pop",SEG), -/* [1,8] */ TNS("sbbb",RMw), TS("sbb",RMw), TNS("sbbb",MRw), TS("sbb",MRw), -/* [1,C] */ TNS("sbbb",IA), TS("sbb",IA), TSx("push",SEG), TSx("pop",SEG), -}, { -/* [2,0] */ TNS("andb",RMw), TS("and",RMw), TNS("andb",MRw), TS("and",MRw), -/* [2,4] */ TNS("andb",IA), TS("and",IA), TNSx("%es:",OVERRIDE), TNSx("daa",NORM), -/* [2,8] */ TNS("subb",RMw), TS("sub",RMw), TNS("subb",MRw), TS("sub",MRw), -/* [2,C] */ TNS("subb",IA), TS("sub",IA), TNSx("%cs:",OVERRIDE), TNSx("das",NORM), -}, { -/* [3,0] */ TNS("xorb",RMw), TS("xor",RMw), TNS("xorb",MRw), TS("xor",MRw), -/* [3,4] */ TNS("xorb",IA), TS("xor",IA), TNSx("%ss:",OVERRIDE), TNSx("aaa",NORM), -/* [3,8] */ TNS("cmpb",RMw), TS("cmp",RMw), TNS("cmpb",MRw), TS("cmp",MRw), -/* [3,C] */ TNS("cmpb",IA), TS("cmp",IA), TNSx("%ds:",OVERRIDE), TNSx("aas",NORM), -}, { -/* [4,0] */ TSx("inc",R), TSx("inc",R), TSx("inc",R), TSx("inc",R), -/* [4,4] */ TSx("inc",R), TSx("inc",R), TSx("inc",R), TSx("inc",R), -/* [4,8] */ TSx("dec",R), TSx("dec",R), TSx("dec",R), TSx("dec",R), -/* [4,C] */ TSx("dec",R), TSx("dec",R), TSx("dec",R), TSx("dec",R), -}, { -/* [5,0] */ TSp("push",R), TSp("push",R), TSp("push",R), TSp("push",R), -/* [5,4] */ TSp("push",R), TSp("push",R), TSp("push",R), TSp("push",R), -/* [5,8] */ TSp("pop",R), TSp("pop",R), TSp("pop",R), TSp("pop",R), -/* [5,C] */ TSp("pop",R), TSp("pop",R), TSp("pop",R), TSp("pop",R), -}, { -/* [6,0] */ TSZx("pusha",IMPLMEM,28),TSZx("popa",IMPLMEM,28), TSx("bound",MR), TNS("arpl",RMw), -/* [6,4] */ TNS("%fs:",OVERRIDE), TNS("%gs:",OVERRIDE), TNS("data16",DM), TNS("addr16",AM), -/* [6,8] */ TSp("push",I), TS("imul",IMUL), TSp("push",Ib), TS("imul",IMUL), -/* [6,C] */ TNSZ("insb",IMPLMEM,1), TSZ("ins",IMPLMEM,4), TNSZ("outsb",IMPLMEM,1),TSZ("outs",IMPLMEM,4), -}, { -/* [7,0] */ TNSy("jo",BD), TNSy("jno",BD), TNSy("jb",BD), TNSy("jae",BD), -/* [7,4] */ TNSy("je",BD), TNSy("jne",BD), TNSy("jbe",BD), TNSy("ja",BD), -/* [7,8] */ TNSy("js",BD), TNSy("jns",BD), TNSy("jp",BD), TNSy("jnp",BD), -/* [7,C] */ TNSy("jl",BD), TNSy("jge",BD), TNSy("jle",BD), TNSy("jg",BD), -}, { -/* [8,0] */ IND(dis_op80), IND(dis_op81), INDx(dis_op82), IND(dis_op83), -/* [8,4] */ TNS("testb",RMw), TS("test",RMw), TNS("xchgb",RMw), TS("xchg",RMw), -/* [8,8] */ TNS("movb",RMw), TS("mov",RMw), TNS("movb",MRw), TS("mov",MRw), -/* [8,C] */ TNS("movw",SM), TS("lea",MR), TNS("movw",MS), TSp("pop",M), -}, { -/* [9,0] */ TNS("nop",NORM), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), -/* [9,4] */ TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), -/* [9,8] */ TNS("cXtX",CBW), TNS("cXtX",CWD), TNSx("lcall",SO), TNS("fwait",NORM), -/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNSx("sahf",NORM), TNSx("lahf",NORM), -}, { -/* [A,0] */ TNS("movb",OA), TS("mov",OA), TNS("movb",AO), TS("mov",AO), -/* [A,4] */ TNSZ("movsb",SD,1), TS("movs",SD), TNSZ("cmpsb",SD,1), TS("cmps",SD), -/* [A,8] */ TNS("testb",IA), TS("test",IA), TNS("stosb",AD), TS("stos",AD), -/* [A,C] */ TNS("lodsb",SA), TS("lods",SA), TNS("scasb",AD), TS("scas",AD), -}, { 
-/* [B,0] */ TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), -/* [B,4] */ TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), -/* [B,8] */ TS("mov",IR), TS("mov",IR), TS("mov",IR), TS("mov",IR), -/* [B,C] */ TS("mov",IR), TS("mov",IR), TS("mov",IR), TS("mov",IR), -}, { -/* [C,0] */ IND(dis_opC0), IND(dis_opC1), TNSyp("ret",RET), TNSyp("ret",NORM), -/* [C,4] */ TNSx("les",MR), TNSx("lds",MR), TNS("movb",IMw), TS("mov",IMw), -/* [C,8] */ TNSyp("enter",ENTER), TNSyp("leave",NORM), TNS("lret",RET), TNS("lret",NORM), -/* [C,C] */ TNS("int",INT3), TNS("int",INTx), TNSx("into",NORM), TNS("iret",NORM), -}, { -/* [D,0] */ IND(dis_opD0), IND(dis_opD1), IND(dis_opD2), IND(dis_opD3), -/* [D,4] */ TNSx("aam",U), TNSx("aad",U), TNSx("falc",NORM), TNSZ("xlat",IMPLMEM,1), - -/* 287 instructions. Note that although the indirect field */ -/* indicates opFP1n2 for further decoding, this is not necessarily */ -/* the case since the opFP arrays are not partitioned according to key1 */ -/* and key2. opFP1n2 is given only to indicate that we haven't */ -/* finished decoding the instruction. */ -/* [D,8] */ IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), -/* [D,C] */ IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), IND(&dis_opFP1n2[0][0]), -}, { -/* [E,0] */ TNSy("loopnz",BD), TNSy("loopz",BD), TNSy("loop",BD), TNSy("jcxz",BD), -/* [E,4] */ TNS("inb",P), TS("in",P), TNS("outb",P), TS("out",P), -/* [E,8] */ TNSyp("call",D), TNSy("jmp",D), TNSx("ljmp",SO), TNSy("jmp",BD), -/* [E,C] */ TNS("inb",V), TS("in",V), TNS("outb",V), TS("out",V), -}, { -/* [F,0] */ TNS("lock",LOCK), TNS("icebp", NORM), TNS("repnz",PREFIX), TNS("repz",PREFIX), -/* [F,4] */ TNS("hlt",NORM), TNS("cmc",NORM), IND(dis_opF6), IND(dis_opF7), -/* [F,8] */ TNS("clc",NORM), TNS("stc",NORM), TNS("cli",NORM), TNS("sti",NORM), -/* [F,C] */ TNS("cld",NORM), TNS("std",NORM), IND(dis_opFE), IND(dis_opFF), -} }; - -/* END CSTYLED */ - -/* - * common functions to decode and disassemble an x86 or amd64 instruction - */ - -/* - * These are the individual fields of a REX prefix. Note that a REX - * prefix with none of these set is still needed to: - * - use the MOVSXD (sign extend 32 to 64 bits) instruction - * - access the %sil, %dil, %bpl, %spl registers - */ -#define REX_W 0x08 /* 64 bit operand size when set */ -#define REX_R 0x04 /* high order bit extension of ModRM reg field */ -#define REX_X 0x02 /* high order bit extension of SIB index field */ -#define REX_B 0x01 /* extends ModRM r_m, SIB base, or opcode reg */ - -static uint_t opnd_size; /* SIZE16, SIZE32 or SIZE64 */ -static uint_t addr_size; /* SIZE16, SIZE32 or SIZE64 */ - -/* - * Even in 64 bit mode, usually only 4 byte immediate operands are supported. - */ -static int isize[] = {1, 2, 4, 4}; -static int isize64[] = {1, 2, 4, 8}; - -/* - * Just a bunch of useful macros. - */ -#define WBIT(x) (x & 0x1) /* to get w bit */ -#define REGNO(x) (x & 0x7) /* to get 3 bit register */ -#define VBIT(x) ((x)>>1 & 0x1) /* to get 'v' bit */ -#define OPSIZE(osize, wbit) ((wbit) ? isize[osize] : 1) -#define OPSIZE64(osize, wbit) ((wbit) ? 
isize64[osize] : 1) - -#define REG_ONLY 3 /* mode to indicate a register operand (not memory) */ - -#define BYTE_OPND 0 /* w-bit value indicating byte register */ -#define LONG_OPND 1 /* w-bit value indicating opnd_size register */ -#define MM_OPND 2 /* "value" used to indicate a mmx reg */ -#define XMM_OPND 3 /* "value" used to indicate a xmm reg */ -#define SEG_OPND 4 /* "value" used to indicate a segment reg */ -#define CONTROL_OPND 5 /* "value" used to indicate a control reg */ -#define DEBUG_OPND 6 /* "value" used to indicate a debug reg */ -#define TEST_OPND 7 /* "value" used to indicate a test reg */ -#define WORD_OPND 8 /* w-bit value indicating word size reg */ - -/* - * Get the next byte and separate the op code into the high and low nibbles. - */ -static int -dtrace_get_opcode(dis86_t *x, uint_t *high, uint_t *low) -{ - int byte; - - /* - * x86 instructions have a maximum length of 15 bytes. Bail out if - * we try to read more. - */ - if (x->d86_len >= 15) - return (x->d86_error = 1); - - if (x->d86_error) - return (1); - byte = x->d86_get_byte(x->d86_data); - if (byte < 0) - return (x->d86_error = 1); - x->d86_bytes[x->d86_len++] = byte; - *low = byte & 0xf; /* ----xxxx low 4 bits */ - *high = byte >> 4 & 0xf; /* xxxx---- bits 7 to 4 */ - return (0); -} - -/* - * Get and decode an SIB (scaled index base) byte - */ -static void -dtrace_get_SIB(dis86_t *x, uint_t *ss, uint_t *index, uint_t *base) -{ - int byte; - - if (x->d86_error) - return; - - byte = x->d86_get_byte(x->d86_data); - if (byte < 0) { - x->d86_error = 1; - return; - } - x->d86_bytes[x->d86_len++] = byte; - - *base = byte & 0x7; - *index = (byte >> 3) & 0x7; - *ss = (byte >> 6) & 0x3; -} - -/* - * Get the byte following the op code and separate it into the - * mode, register, and r/m fields. - */ -static void -dtrace_get_modrm(dis86_t *x, uint_t *mode, uint_t *reg, uint_t *r_m) -{ - if (x->d86_got_modrm == 0) { - if (x->d86_rmindex == -1) - x->d86_rmindex = x->d86_len; - dtrace_get_SIB(x, mode, reg, r_m); - x->d86_got_modrm = 1; - } -} - -/* - * Adjust register selection based on any REX prefix bits present. - */ -/*ARGSUSED*/ -static void -dtrace_rex_adjust(uint_t rex_prefix, uint_t mode, uint_t *reg, uint_t *r_m) -{ - if (reg != NULL && r_m == NULL) { - if (rex_prefix & REX_B) - *reg += 8; - } else { - if (reg != NULL && (REX_R & rex_prefix) != 0) - *reg += 8; - if (r_m != NULL && (REX_B & rex_prefix) != 0) - *r_m += 8; - } -} - -/* - * Get an immediate operand of the given size, with sign extension. 
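The ModRM helpers above all rely on the same 2/3/3 bit split of the byte, with REX.R and REX.B supplying a fourth bit for the reg and r_m fields. A minimal standalone sketch of that split (not part of the patch; split_modrm and the example bytes are invented for illustration, the REX_R/REX_B values match the defines above):

/*
 * Standalone illustration: split a ModRM byte into mode/reg/r_m and
 * widen reg/r_m with REX.R/REX.B, the same bit layout that
 * dtrace_get_modrm() and dtrace_rex_adjust() rely on.
 */
#include <stdio.h>
#include <stdint.h>

#define REX_R	0x04	/* extends the ModRM reg field */
#define REX_B	0x01	/* extends the ModRM r_m field */

static void
split_modrm(uint8_t modrm, uint8_t rex, unsigned *mode, unsigned *reg,
    unsigned *r_m)
{
	*mode = (modrm >> 6) & 0x3;	/* 0-2: memory forms, 3: register */
	*reg  = (modrm >> 3) & 0x7;
	*r_m  = modrm & 0x7;
	if (rex & REX_R)
		*reg += 8;		/* selects %r8-%r15 for reg */
	if (rex & REX_B)
		*r_m += 8;		/* selects %r8-%r15 for r_m */
}

int
main(void)
{
	unsigned mode, reg, r_m;

	/* 4c 89 e5 is "mov %r12,%rbp": REX=0x4c (W,R set), ModRM=0xe5. */
	split_modrm(0xe5, 0x4c, &mode, &reg, &r_m);
	printf("mode=%u reg=%u r_m=%u\n", mode, reg, r_m);	/* 3 12 5 */
	return 0;
}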
- */ -static void -dtrace_imm_opnd(dis86_t *x, int wbit, int size, int opindex) -{ - int i; - int byte; - int valsize = 0; - - if (x->d86_numopnds < opindex + 1) - x->d86_numopnds = opindex + 1; - - switch (wbit) { - case BYTE_OPND: - valsize = 1; - break; - case LONG_OPND: - if (x->d86_opnd_size == SIZE16) - valsize = 2; - else if (x->d86_opnd_size == SIZE32) - valsize = 4; - else - valsize = 8; - break; - case MM_OPND: - case XMM_OPND: - case SEG_OPND: - case CONTROL_OPND: - case DEBUG_OPND: - case TEST_OPND: - valsize = size; - break; - case WORD_OPND: - valsize = 2; - break; - } - if (valsize < size) - valsize = size; - - if (x->d86_error) - return; - x->d86_opnd[opindex].d86_value = 0; - for (i = 0; i < size; ++i) { - byte = x->d86_get_byte(x->d86_data); - if (byte < 0) { - x->d86_error = 1; - return; - } - x->d86_bytes[x->d86_len++] = byte; - x->d86_opnd[opindex].d86_value |= (uint64_t)byte << (i * 8); - } - /* Do sign extension */ - if (x->d86_bytes[x->d86_len - 1] & 0x80) { - for (; i < valsize; i++) - x->d86_opnd[opindex].d86_value |= - (uint64_t)0xff << (i* 8); - } -#ifdef DIS_TEXT - x->d86_opnd[opindex].d86_mode = MODE_SIGNED; - x->d86_opnd[opindex].d86_value_size = valsize; - x->d86_imm_bytes += size; -#endif -} - -/* - * Get an ip relative operand of the given size, with sign extension. - */ -static void -dtrace_disp_opnd(dis86_t *x, int wbit, int size, int opindex) -{ - dtrace_imm_opnd(x, wbit, size, opindex); -#ifdef DIS_TEXT - x->d86_opnd[opindex].d86_mode = MODE_IPREL; -#endif -} - -/* - * Check to see if there is a segment override prefix pending. - * If so, print it in the current 'operand' location and set - * the override flag back to false. - */ -/*ARGSUSED*/ -static void -dtrace_check_override(dis86_t *x, int opindex) -{ -#ifdef DIS_TEXT - if (x->d86_seg_prefix) { - (void) strlcat(x->d86_opnd[opindex].d86_prefix, - x->d86_seg_prefix, PFIXLEN); - } -#endif - x->d86_seg_prefix = NULL; -} - - -/* - * Process a single instruction Register or Memory operand. - * - * mode = addressing mode from ModRM byte - * r_m = r_m (or reg if mode == 3) field from ModRM byte - * wbit = indicates which register (8bit, 16bit, ... MMX, etc.) set to use. - * o = index of operand that we are processing (0, 1 or 2) - * - * the value of reg or r_m must have already been adjusted for any REX prefix. 
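dtrace_imm_opnd() above assembles the immediate one byte at a time, little-endian, and then propagates the sign bit into the remaining bytes up to valsize. The same arithmetic, reduced to a self-contained sketch with an invented name:

/*
 * Standalone illustration: assemble an n-byte little-endian immediate
 * and sign-extend it to 64 bits, mirroring the byte loop in
 * dtrace_imm_opnd().
 */
#include <stdio.h>
#include <stdint.h>

static int64_t
fetch_imm(const uint8_t *p, int size)
{
	uint64_t v = 0;
	int i;

	for (i = 0; i < size; i++)
		v |= (uint64_t)p[i] << (i * 8);	/* little-endian assembly */
	if (size < 8 && (p[size - 1] & 0x80))	/* top bit set: extend */
		for (; i < 8; i++)
			v |= (uint64_t)0xff << (i * 8);
	return (int64_t)v;
}

int
main(void)
{
	const uint8_t disp8[] = { 0xf6 };			/* -10 */
	const uint8_t imm32[] = { 0x78, 0x56, 0x34, 0x12 };

	printf("%lld\n", (long long)fetch_imm(disp8, 1));	/* -10 */
	printf("0x%llx\n", (unsigned long long)fetch_imm(imm32, 4));
	return 0;
}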
- */ -/*ARGSUSED*/ -static void -dtrace_get_operand(dis86_t *x, uint_t mode, uint_t r_m, int wbit, int opindex) -{ - int have_SIB = 0; /* flag presence of scale-index-byte */ - uint_t ss; /* scale-factor from opcode */ - uint_t index; /* index register number */ - uint_t base; /* base register number */ - int dispsize; /* size of displacement in bytes */ -#ifdef DIS_TEXT - char *opnd = x->d86_opnd[opindex].d86_opnd; -#endif - - if (x->d86_numopnds < opindex + 1) - x->d86_numopnds = opindex + 1; - - if (x->d86_error) - return; - - /* - * first handle a simple register - */ - if (mode == REG_ONLY) { -#ifdef DIS_TEXT - switch (wbit) { - case MM_OPND: - (void) strlcat(opnd, dis_MMREG[r_m], OPLEN); - break; - case XMM_OPND: - (void) strlcat(opnd, dis_XMMREG[r_m], OPLEN); - break; - case SEG_OPND: - (void) strlcat(opnd, dis_SEGREG[r_m], OPLEN); - break; - case CONTROL_OPND: - (void) strlcat(opnd, dis_CONTROLREG[r_m], OPLEN); - break; - case DEBUG_OPND: - (void) strlcat(opnd, dis_DEBUGREG[r_m], OPLEN); - break; - case TEST_OPND: - (void) strlcat(opnd, dis_TESTREG[r_m], OPLEN); - break; - case BYTE_OPND: - if (x->d86_rex_prefix == 0) - (void) strlcat(opnd, dis_REG8[r_m], OPLEN); - else - (void) strlcat(opnd, dis_REG8_REX[r_m], OPLEN); - break; - case WORD_OPND: - (void) strlcat(opnd, dis_REG16[r_m], OPLEN); - break; - case LONG_OPND: - if (x->d86_opnd_size == SIZE16) - (void) strlcat(opnd, dis_REG16[r_m], OPLEN); - else if (x->d86_opnd_size == SIZE32) - (void) strlcat(opnd, dis_REG32[r_m], OPLEN); - else - (void) strlcat(opnd, dis_REG64[r_m], OPLEN); - break; - } -#endif /* DIS_TEXT */ - return; - } - - /* - * if symbolic representation, skip override prefix, if any - */ - dtrace_check_override(x, opindex); - - /* - * Handle 16 bit memory references first, since they decode - * the mode values more simply. - * mode 1 is r_m + 8 bit displacement - * mode 2 is r_m + 16 bit displacement - * mode 0 is just r_m, unless r_m is 6 which is 16 bit disp - */ - if (x->d86_addr_size == SIZE16) { - if ((mode == 0 && r_m == 6) || mode == 2) - dtrace_imm_opnd(x, WORD_OPND, 2, opindex); - else if (mode == 1) - dtrace_imm_opnd(x, BYTE_OPND, 1, opindex); -#ifdef DIS_TEXT - if (mode == 0 && r_m == 6) - x->d86_opnd[opindex].d86_mode = MODE_SIGNED; - else if (mode == 0) - x->d86_opnd[opindex].d86_mode = MODE_NONE; - else - x->d86_opnd[opindex].d86_mode = MODE_OFFSET; - (void) strlcat(opnd, dis_addr16[mode][r_m], OPLEN); -#endif - return; - } - - /* - * 32 and 64 bit addressing modes are more complex since they - * can involve an SIB (scaled index and base) byte to decode. - */ - if (r_m == ESP_REGNO || r_m == ESP_REGNO + 8) { - have_SIB = 1; - dtrace_get_SIB(x, &ss, &index, &base); - if (x->d86_error) - return; - if (base != 5 || mode != 0) - if (x->d86_rex_prefix & REX_B) - base += 8; - if (x->d86_rex_prefix & REX_X) - index += 8; - } else { - base = r_m; - } - - /* - * Compute the displacement size and get its bytes - */ - dispsize = 0; - - if (mode == 1) - dispsize = 1; - else if (mode == 2) - dispsize = 4; - else if ((r_m & 7) == EBP_REGNO || - (have_SIB && (base & 7) == EBP_REGNO)) - dispsize = 4; - - if (dispsize > 0) { - dtrace_imm_opnd(x, dispsize == 4 ? 
LONG_OPND : BYTE_OPND, - dispsize, opindex); - if (x->d86_error) - return; - } - -#ifdef DIS_TEXT - if (dispsize > 0) - x->d86_opnd[opindex].d86_mode = MODE_OFFSET; - - if (have_SIB == 0) { - if (x->d86_mode == SIZE32) { - if (mode == 0) - (void) strlcat(opnd, dis_addr32_mode0[r_m], - OPLEN); - else - (void) strlcat(opnd, dis_addr32_mode12[r_m], - OPLEN); - } else { - if (mode == 0) - (void) strlcat(opnd, dis_addr64_mode0[r_m], - OPLEN); - else - (void) strlcat(opnd, dis_addr64_mode12[r_m], - OPLEN); - } - } else { - uint_t need_paren = 0; - char **regs; - if (x->d86_mode == SIZE32) /* NOTE this is not addr_size! */ - regs = (char **)dis_REG32; - else - regs = (char **)dis_REG64; - - /* - * print the base (if any) - */ - if (base == EBP_REGNO && mode == 0) { - if (index != ESP_REGNO) { - (void) strlcat(opnd, "(", OPLEN); - need_paren = 1; - } - } else { - (void) strlcat(opnd, "(", OPLEN); - (void) strlcat(opnd, regs[base], OPLEN); - need_paren = 1; - } - - /* - * print the index (if any) - */ - if (index != ESP_REGNO) { - (void) strlcat(opnd, ",", OPLEN); - (void) strlcat(opnd, regs[index], OPLEN); - (void) strlcat(opnd, dis_scale_factor[ss], OPLEN); - } else - if (need_paren) - (void) strlcat(opnd, ")", OPLEN); - } -#endif -} - -/* - * Operand sequence for standard instruction involving one register - * and one register/memory operand. - * wbit indicates a byte(0) or opnd_size(1) operation - * vbit indicates direction (0 for "opcode r,r_m") or (1 for "opcode r_m, r") - */ -#define STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, vbit) { \ - dtrace_get_modrm(x, &mode, ®, &r_m); \ - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ - dtrace_get_operand(x, mode, r_m, wbit, vbit); \ - dtrace_get_operand(x, REG_ONLY, reg, wbit, 1 - vbit); \ -} - -/* - * Similar to above, but allows for the two operands to be of different - * classes (ie. wbit). - * wbit is for the r_m operand - * w2 is for the reg operand - */ -#define MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, w2, vbit) { \ - dtrace_get_modrm(x, &mode, ®, &r_m); \ - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ - dtrace_get_operand(x, mode, r_m, wbit, vbit); \ - dtrace_get_operand(x, REG_ONLY, reg, w2, 1 - vbit); \ -} - -/* - * Similar, but for 2 operands plus an immediate. - */ -#define THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize) { \ - dtrace_get_modrm(x, &mode, ®, &r_m); \ - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ - dtrace_get_operand(x, mode, r_m, wbit, 1); \ - dtrace_get_operand(x, REG_ONLY, reg, w2, 2); \ - dtrace_imm_opnd(x, wbit, immsize, 0); \ -} - -/* - * Dissassemble a single x86 or amd64 instruction. - * - * Mode determines the default operating mode (SIZE16, SIZE32 or SIZE64) - * for interpreting instructions. 
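The SIB printing above emits AT&T "disp(base,index,scale)" syntax, with index 4 (the %rsp slot) meaning "no index". A compressed sketch of that formatting, assuming an abbreviated register-name table (the real code indexes dis_REG32/dis_REG64 and dis_scale_factor):

/*
 * Standalone illustration: format a decoded 64-bit memory operand as
 * AT&T "disp(base,index,scale)". Register table is abbreviated and
 * illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

static const char *regs64[] = {
	"%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi"
};

static void
format_mem(char *buf, size_t len, int32_t disp, int base, int index, int ss)
{
	int64_t d = disp;
	int n = 0;

	if (d != 0)
		n = snprintf(buf, len, "%s0x%llx", d < 0 ? "-" : "",
		    (unsigned long long)(d < 0 ? -d : d));
	/* index 4 (the %rsp slot) means "no index", as in the SIB code */
	if (index == 4)
		snprintf(buf + n, len - n, "(%s)", regs64[base]);
	else
		snprintf(buf + n, len - n, "(%s,%s,%d)",
		    regs64[base], regs64[index], 1 << ss);
}

int
main(void)
{
	char buf[64];

	format_mem(buf, sizeof (buf), -8, 5, 0, 2);
	printf("%s\n", buf);		/* -0x8(%rbp,%rax,4) */
	return 0;
}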
- * - * returns non-zero for bad opcode - */ -int -dtrace_disx86(dis86_t *x, uint_t cpu_mode) -{ - const instable_t *dp = NULL; /* decode table being used */ -#ifdef DIS_TEXT - uint_t i; -#endif -#ifdef DIS_MEM - uint_t nomem = 0; -#define NOMEM (nomem = 1) -#else -#define NOMEM /* nothing */ -#endif - uint_t wbit = 0; /* opcode wbit, 0 is 8 bit, !0 for opnd_size */ - uint_t w2; /* wbit value for second operand */ - uint_t vbit; - uint_t mode = 0; /* mode value from ModRM byte */ - uint_t reg; /* reg value from ModRM byte */ - uint_t r_m; /* r_m value from ModRM byte */ - - uint_t opcode1; /* high nibble of 1st byte */ - uint_t opcode2; /* low nibble of 1st byte */ - uint_t opcode3; /* extra opcode bits usually from ModRM byte */ - uint_t opcode4; /* high nibble of 2nd byte */ - uint_t opcode5 = 0xff; /* low nibble of 2nd byte */ - uint_t opcode6; /* high nibble of 3rd byte */ - uint_t opcode7 = 0xff; /* low nibble of 3rd byte */ - uint_t opcode_bytes = 1; - - /* - * legacy prefixes come in 5 flavors, you should have only one of each - */ - uint_t opnd_size_prefix = 0; - uint_t addr_size_prefix = 0; - uint_t segment_prefix = 0; - uint_t lock_prefix = 0; - uint_t rep_prefix = 0; - uint_t rex_prefix = 0; /* amd64 register extension prefix */ - size_t off; - - x->d86_len = 0; - x->d86_rmindex = -1; - x->d86_error = 0; -#ifdef DIS_TEXT - x->d86_numopnds = 0; - x->d86_seg_prefix = NULL; - x->d86_mneu[0] = 0; - for (i = 0; i < 3; ++i) { - x->d86_opnd[i].d86_opnd[0] = 0; - x->d86_opnd[i].d86_prefix[0] = 0; - x->d86_opnd[i].d86_value_size = 0; - x->d86_opnd[i].d86_value = 0; - x->d86_opnd[i].d86_mode = MODE_NONE; - } -#endif - x->d86_error = 0; - x->d86_memsize = 0; - - if (cpu_mode == SIZE16) { - opnd_size = SIZE16; - addr_size = SIZE16; - } else if (cpu_mode == SIZE32) { - opnd_size = SIZE32; - addr_size = SIZE32; - } else { - opnd_size = SIZE32; - addr_size = SIZE64; - } - - /* - * Get one opcode byte and check for zero padding that follows - * jump tables. - */ - if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) - goto error; - - if (opcode1 == 0 && opcode2 == 0 && - x->d86_check_func != NULL && x->d86_check_func(x->d86_data)) { -#ifdef DIS_TEXT - (void) strncpy(x->d86_mneu, ".byte\t0", OPLEN); -#endif - goto done; - } - - /* - * Gather up legacy x86 prefix bytes. - */ - for (;;) { - uint_t *which_prefix = NULL; - - dp = &dis_distable[opcode1][opcode2]; - - switch (dp->it_adrmode) { - case PREFIX: - which_prefix = &rep_prefix; - break; - case LOCK: - which_prefix = &lock_prefix; - break; - case OVERRIDE: - which_prefix = &segment_prefix; -#ifdef DIS_TEXT - x->d86_seg_prefix = (char *)dp->it_name; -#endif - if (dp->it_invalid64 && cpu_mode == SIZE64) - goto error; - break; - case AM: - which_prefix = &addr_size_prefix; - break; - case DM: - which_prefix = &opnd_size_prefix; - break; - } - if (which_prefix == NULL) - break; - *which_prefix = (opcode1 << 4) | opcode2; - if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) - goto error; - } - - /* - * Handle amd64 mode PREFIX values. - * Some of the segment prefixes are no-ops. (only FS/GS actually work) - * We might have a REX prefix (opcodes 0x40-0x4f) - */ - if (cpu_mode == SIZE64) { - if (segment_prefix != 0x64 && segment_prefix != 0x65) - segment_prefix = 0; - - if (opcode1 == 0x4) { - rex_prefix = (opcode1 << 4) | opcode2; - if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) - goto error; - dp = &dis_distable[opcode1][opcode2]; - } - } - - /* - * Deal with selection of operand and address size now. 
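By this point the driver has already peeled the legacy prefixes off via dis_distable lookups and, in 64-bit mode, consumed a single REX byte in the 0x40-0x4f range. A rough standalone equivalent of that scan, recognizing only a few common prefixes (the function name and example encoding are invented for illustration):

/*
 * Standalone illustration: consume legacy prefixes and, in 64-bit
 * mode, one trailing REX byte, returning the offset of the opcode.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct prefixes {
	uint8_t opnd_size;	/* 0x66 seen */
	uint8_t addr_size;	/* 0x67 seen */
	uint8_t rep;		/* 0xf2/0xf3, also selects SSE variants */
	uint8_t lock;		/* 0xf0 */
	uint8_t rex;		/* 0x40-0x4f, 64-bit mode only */
};

static size_t
scan_prefixes(const uint8_t *insn, size_t len, int is64, struct prefixes *pfx)
{
	size_t i = 0;

	*pfx = (struct prefixes){ 0 };
	for (; i < len; i++) {
		uint8_t b = insn[i];

		if (b == 0x66)
			pfx->opnd_size = b;
		else if (b == 0x67)
			pfx->addr_size = b;
		else if (b == 0xf2 || b == 0xf3)
			pfx->rep = b;
		else if (b == 0xf0)
			pfx->lock = b;
		else
			break;		/* not a legacy prefix */
	}
	/* A REX byte, if present, must immediately precede the opcode. */
	if (is64 && i < len && (insn[i] & 0xf0) == 0x40)
		pfx->rex = insn[i++];
	return i;			/* offset of the opcode byte */
}

int
main(void)
{
	/* f3 48 a5 is "rep movsq": repz prefix, REX.W, opcode 0xa5 */
	const uint8_t insn[] = { 0xf3, 0x48, 0xa5 };
	struct prefixes pfx;
	size_t off = scan_prefixes(insn, sizeof (insn), 1, &pfx);

	printf("opcode at %zu, rep=%#x rex=%#x\n", off, pfx.rep, pfx.rex);
	return 0;
}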
- * Note that the REX.W bit being set causes opnd_size_prefix to be - * ignored. - */ - if (cpu_mode == SIZE64) { - if (rex_prefix & 0x08) - opnd_size = SIZE64; - else if (opnd_size_prefix) - opnd_size = SIZE16; - - if (addr_size_prefix) - addr_size = SIZE32; - } else if (cpu_mode == SIZE32) { - if (opnd_size_prefix) - opnd_size = SIZE16; - if (addr_size_prefix) - addr_size = SIZE16; - } else { - if (opnd_size_prefix) - opnd_size = SIZE32; - if (addr_size_prefix) - addr_size = SIZE32; - } - - /* - * The pause instruction - a repz'd nop. This doesn't fit - * with any of the other prefix goop added for SSE, so we'll - * special-case it here. - */ - if (rep_prefix == 0xf3 && opcode1 == 0x9 && opcode2 == 0x0) { - rep_prefix = 0; - dp = &dis_opPause; - } - - /* - * Some 386 instructions have 2 bytes of opcode before the mod_r/m - * byte so we may need to perform a table indirection. - */ - if (dp->it_indirect == dis_op0F[0]) { - if (dtrace_get_opcode(x, &opcode4, &opcode5) != 0) - goto error; - opcode_bytes = 2; - if (opcode4 == 0x7 && opcode5 >= 0x1 && opcode5 <= 0x3) { - uint_t subcode; - - if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0) - goto error; - opcode_bytes = 3; - subcode = ((opcode6 & 0x3) << 1) | - ((opcode7 & 0x8) >> 3); - dp = &dis_op0F7123[opcode5][subcode]; - } else if ((opcode4 == 0xc) && (opcode5 >= 0x8)) { - dp = &dis_op0FC8[0]; - } else { - dp = &dis_op0F[opcode4][opcode5]; - } - } - - /* - * If still not at a TERM decode entry, then a ModRM byte - * exists and its fields further decode the instruction. - */ - x->d86_got_modrm = 0; - if (dp->it_indirect != TERM) { - dtrace_get_modrm(x, &mode, &opcode3, &r_m); - if (x->d86_error) - goto error; - reg = opcode3; - - /* - * decode 287 instructions (D8-DF) from opcodeN - */ - if (opcode1 == 0xD && opcode2 >= 0x8) { - if (opcode2 == 0xB && mode == 0x3 && opcode3 == 4) - dp = &dis_opFP5[r_m]; - else if (opcode2 == 0xA && mode == 0x3 && opcode3 < 4) - dp = &dis_opFP7[opcode3]; - else if (opcode2 == 0xB && mode == 0x3) - dp = &dis_opFP6[opcode3]; - else if (opcode2 == 0x9 && mode == 0x3 && opcode3 >= 4) - dp = &dis_opFP4[opcode3 - 4][r_m]; - else if (mode == 0x3) - dp = &dis_opFP3[opcode2 - 8][opcode3]; - else - dp = &dis_opFP1n2[opcode2 - 8][opcode3]; - } else { - dp = dp->it_indirect + opcode3; - } - } - - /* - * In amd64 bit mode, ARPL opcode is changed to MOVSXD - * (sign extend 32bit to 64 bit) - */ - if (cpu_mode == SIZE64 && opcode1 == 0x6 && opcode2 == 0x3) - dp = &dis_opMOVSLD; - - /* - * at this point we should have a correct (or invalid) opcode - */ - if ((cpu_mode == SIZE64 && dp->it_invalid64) || - (cpu_mode != SIZE64 && dp->it_invalid32)) - goto error; - if (dp->it_indirect != TERM) - goto error; - - /* - * deal with MMX/SSE opcodes which are changed by prefixes - */ - switch (dp->it_adrmode) { - case MMO: - case MMOIMPL: - case MMO3P: - case MMOM3: - case MMOMS: - case MMOPM: - case MMOPRM: - case MMOS: - case XMMO: - case XMMOM: - case XMMOMS: - case XMMOPM: - case XMMOS: - case XMMOMX: - case XMMOX3: - case XMMOXMM: - /* - * This is horrible. Some SIMD instructions take the - * form 0x0F 0x?? ..., which is easily decoded using the - * existing tables. Other SIMD instructions use various - * prefix bytes to overload existing instructions. For - * Example, addps is F0, 58, whereas addss is F3 (repz), - * F0, 58. Presumably someone got a raise for this. 
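The size selection above boils down to: in 64-bit mode REX.W forces 64-bit operands and makes a 0x66 prefix irrelevant, otherwise 0x66 selects 16-bit and the default stays 32-bit; stack operations are later promoted from 32 to 64 by the it_always64/it_stackop handling further down. A small sketch of those rules (function name invented, 16-bit default mode omitted):

/*
 * Standalone illustration: effective operand size in 32/64-bit mode
 * given REX.W, a 0x66 prefix, and the stack-op promotion.
 */
#include <stdio.h>

enum opsize { SZ16 = 16, SZ32 = 32, SZ64 = 64 };

static enum opsize
effective_opnd_size(int is64, int rex_w, int pfx66, int is_stackop)
{
	enum opsize sz = SZ32;		/* default in 32/64-bit mode */

	if (is64 && rex_w)
		sz = SZ64;		/* REX.W wins; 0x66 is ignored */
	else if (pfx66)
		sz = SZ16;
	if (is64 && is_stackop && sz == SZ32)
		sz = SZ64;		/* push/pop promote to 64 */
	return sz;
}

int
main(void)
{
	printf("%d\n", effective_opnd_size(1, 1, 1, 0));	/* 64 */
	printf("%d\n", effective_opnd_size(1, 0, 1, 0));	/* 16 */
	printf("%d\n", effective_opnd_size(1, 0, 0, 1));	/* 64 */
	return 0;
}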
- * - * If we see one of the instructions which can be - * modified in this way (if we've got one of the SIMDO* - * address modes), we'll check to see if the last prefix - * was a repz. If it was, we strip the prefix from the - * mnemonic, and we indirect using the dis_opSIMDrepz - * table. - */ - - /* - * Calculate our offset in dis_op0F - */ - if ((uintptr_t)dp - (uintptr_t)dis_op0F > sizeof (dis_op0F)) - goto error; - - off = ((uintptr_t)dp - (uintptr_t)dis_op0F) / - sizeof (instable_t); - - /* - * Rewrite if this instruction used one of the magic prefixes. - */ - if (rep_prefix) { - if (rep_prefix == 0xf2) - dp = &dis_opSIMDrepnz[off]; - else - dp = &dis_opSIMDrepz[off]; - rep_prefix = 0; - } else if (opnd_size_prefix) { - dp = &dis_opSIMDdata16[off]; - opnd_size_prefix = 0; - if (opnd_size == SIZE16) - opnd_size = SIZE32; - } - break; - - case MMOSH: - /* - * As with the "normal" SIMD instructions, the MMX - * shuffle instructions are overloaded. These - * instructions, however, are special in that they use - * an extra byte, and thus an extra table. As of this - * writing, they only use the opnd_size prefix. - */ - - /* - * Calculate our offset in dis_op0F7123 - */ - if ((uintptr_t)dp - (uintptr_t)dis_op0F7123 > - sizeof (dis_op0F7123)) - goto error; - - if (opnd_size_prefix) { - off = ((uintptr_t)dp - (uintptr_t)dis_op0F7123) / - sizeof (instable_t); - dp = &dis_opSIMD7123[off]; - opnd_size_prefix = 0; - if (opnd_size == SIZE16) - opnd_size = SIZE32; - } - break; - } - - /* - * In 64 bit mode, some opcodes automatically use opnd_size == SIZE64. - */ - if (cpu_mode == SIZE64) - if (dp->it_always64 || (opnd_size == SIZE32 && dp->it_stackop)) - opnd_size = SIZE64; - -#ifdef DIS_TEXT - /* - * At this point most instructions can format the opcode mnemonic - * including the prefixes. - */ - if (lock_prefix) - (void) strlcat(x->d86_mneu, "lock ", OPLEN); - - if (rep_prefix == 0xf2) - (void) strlcat(x->d86_mneu, "repnz ", OPLEN); - else if (rep_prefix == 0xf3) - (void) strlcat(x->d86_mneu, "repz ", OPLEN); - - if (cpu_mode == SIZE64 && addr_size_prefix) - (void) strlcat(x->d86_mneu, "addr32 ", OPLEN); - - if (dp->it_adrmode != CBW && - dp->it_adrmode != CWD && - dp->it_adrmode != XMMSFNC) { - if (strcmp(dp->it_name, "INVALID") == 0) - goto error; - (void) strlcat(x->d86_mneu, dp->it_name, OPLEN); - if (dp->it_suffix) { - char *types[] = {"", "w", "l", "q"}; - if (opcode_bytes == 2 && opcode4 == 4) { - /* It's a cmovx.yy. Replace the suffix x */ - for (i = 5; i < OPLEN; i++) { - if (x->d86_mneu[i] == '.') - break; - } - x->d86_mneu[i - 1] = *types[opnd_size]; - } else { - (void) strlcat(x->d86_mneu, types[opnd_size], - OPLEN); - } - } - } -#endif - - /* - * Process operands based on the addressing modes. 
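When the mnemonic is assembled above, the AT&T size suffix is picked by indexing {"", "w", "l", "q"} with opnd_size (SIZE16=1, SIZE32=2, SIZE64=3 as defined in dis_tables.h). A standalone demonstration of that lookup:

/*
 * Standalone illustration: AT&T size-suffix selection keyed by the
 * SIZE16/SIZE32/SIZE64 operand-size values.
 */
#include <stdio.h>

#define SIZE16	1
#define SIZE32	2
#define SIZE64	3

int
main(void)
{
	static const char *types[] = { "", "w", "l", "q" };
	int sizes[] = { SIZE16, SIZE32, SIZE64 };
	int i;

	for (i = 0; i < 3; i++)
		printf("push%s\n", types[sizes[i]]);	/* pushw pushl pushq */
	return 0;
}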
- */ - x->d86_mode = cpu_mode; - x->d86_rex_prefix = rex_prefix; - x->d86_opnd_size = opnd_size; - x->d86_addr_size = addr_size; - vbit = 0; /* initialize for mem/reg -> reg */ - switch (dp->it_adrmode) { - /* - * amd64 instruction to sign extend 32 bit reg/mem operands - * into 64 bit register values - */ - case MOVSXZ: -#ifdef DIS_TEXT - if (rex_prefix == 0) - (void) strncpy(x->d86_mneu, "movzld", OPLEN); -#endif - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - x->d86_opnd_size = SIZE64; - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - x->d86_opnd_size = opnd_size = SIZE32; - wbit = LONG_OPND; - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* - * movsbl movsbw movsbq (0x0FBE) or movswl movswq (0x0FBF) - * movzbl movzbw movzbq (0x0FB6) or mobzwl movzwq (0x0FB7) - * wbit lives in 2nd byte, note that operands - * are different sized - */ - case MOVZ: - if (rex_prefix & REX_W) { - /* target register size = 64 bit */ - x->d86_mneu[5] = 'q'; - } - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - x->d86_opnd_size = opnd_size = SIZE16; - wbit = WBIT(opcode5); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* - * imul instruction, with either 8-bit or longer immediate - * opcode 0x6B for byte, sign-extended displacement, 0x69 for word(s) - */ - case IMUL: - wbit = LONG_OPND; - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, - OPSIZE(opnd_size, opcode2 == 0x9)); - break; - - /* memory or register operand to register, with 'w' bit */ - case MRw: - wbit = WBIT(opcode2); - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); - break; - - /* register to memory or register operand, with 'w' bit */ - /* arpl happens to fit here also because it is odd */ - case RMw: - if (opcode_bytes == 2) - wbit = WBIT(opcode5); - else - wbit = WBIT(opcode2); - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* xaddb instruction */ - case XADDB: - wbit = 0; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* MMX register to memory or register operand */ - case MMS: - case MMOS: -#ifdef DIS_TEXT - wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; -#else - wbit = LONG_OPND; -#endif - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1); - break; - - /* MMX register to memory */ - case MMOMS: - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode == REG_ONLY) - goto error; - wbit = MM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1); - break; - - /* Double shift. Has immediate operand specifying the shift. */ - case DSHIFT: - wbit = LONG_OPND; - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 2); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - dtrace_imm_opnd(x, wbit, 1, 0); - break; - - /* - * Double shift. With no immediate operand, specifies using %cl. 
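Most of the cases above lean on the w-bit convention: the low opcode bit chooses between a byte operation and an operand-sized one, which is all WBIT() extracts; the 0xb0-0xbf mov-immediate row moves that bit to bit 3 of the low nibble instead. A trivial demonstration:

/*
 * Standalone illustration: the w bit distinguishes byte from
 * operand-sized forms of the same instruction.
 */
#include <stdio.h>

#define WBIT(x)	((x) & 0x1)

int
main(void)
{
	/* 0x88 is movb reg->r/m, 0x89 is the operand-sized mov. */
	printf("0x88: w=%d (byte)\n", WBIT(0x88));
	printf("0x89: w=%d (opnd_size)\n", WBIT(0x89));

	/*
	 * For the 0xb0-0xbf row the w bit is bit 3 of the low nibble:
	 * 0xb0-0xb7 are movb $imm8,%reg, 0xb8-0xbf are mov $imm,%reg.
	 */
	printf("0xb0: w=%d, 0xb8: w=%d\n", (0x0 >> 3) & 1, (0x8 >> 3) & 1);
	return 0;
}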
- */ - case DSHIFTcl: - wbit = LONG_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* immediate to memory or register operand */ - case IMlw: - wbit = WBIT(opcode2); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); - /* - * Have long immediate for opcode 0x81, but not 0x80 nor 0x83 - */ - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, opcode2 == 1), 0); - break; - - /* immediate to memory or register operand with the */ - /* 'w' bit present */ - case IMw: - wbit = WBIT(opcode2); - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0); - break; - - /* immediate to register with register in low 3 bits */ - /* of op code */ - case IR: - /* w-bit here (with regs) is bit 3 */ - wbit = opcode2 >>3 & 0x1; - reg = REGNO(opcode2); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - mode = REG_ONLY; - r_m = reg; - dtrace_get_operand(x, mode, r_m, wbit, 1); - dtrace_imm_opnd(x, wbit, OPSIZE64(opnd_size, wbit), 0); - break; - - /* MMX immediate shift of register */ - case MMSH: - case MMOSH: - wbit = MM_OPND; - goto mm_shift; /* in next case */ - - /* SIMD immediate shift of register */ - case XMMSH: - wbit = XMM_OPND; -mm_shift: - reg = REGNO(opcode7); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); - dtrace_imm_opnd(x, wbit, 1, 0); - NOMEM; - break; - - /* accumulator to memory operand */ - case AO: - vbit = 1; - /*FALLTHROUGH*/ - - /* memory operand to accumulator */ - case OA: - wbit = WBIT(opcode2); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1 - vbit); - dtrace_imm_opnd(x, wbit, OPSIZE64(addr_size, LONG_OPND), vbit); -#ifdef DIS_TEXT - x->d86_opnd[vbit].d86_mode = MODE_OFFSET; -#endif - break; - - - /* segment register to memory or register operand */ - case SM: - vbit = 1; - /*FALLTHROUGH*/ - - /* memory or register operand to segment register */ - case MS: - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, LONG_OPND, vbit); - dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 1 - vbit); - break; - - /* - * rotate or shift instructions, which may shift by 1 or - * consult the cl register, depending on the 'v' bit - */ - case Mv: - vbit = VBIT(opcode2); - wbit = WBIT(opcode2); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); -#ifdef DIS_TEXT - if (vbit) { - (void) strlcat(x->d86_opnd[0].d86_opnd, "%cl", OPLEN); - } else { - x->d86_opnd[0].d86_mode = MODE_SIGNED; - x->d86_opnd[0].d86_value_size = 1; - x->d86_opnd[0].d86_value = 1; - } -#endif - break; - /* - * immediate rotate or shift instructions - */ - case MvI: - wbit = WBIT(opcode2); -normal_imm_mem: - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 1); - dtrace_imm_opnd(x, wbit, 1, 0); - break; - - /* bit test instructions */ - case MIb: - wbit = LONG_OPND; - goto normal_imm_mem; - - /* single memory or register operand with 'w' bit present */ - case Mw: - wbit = WBIT(opcode2); -just_mem: - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - case SWAPGS: - if (cpu_mode == SIZE64 && mode == 3 && r_m == 0) { -#ifdef DIS_TEXT - (void) strncpy(x->d86_mneu, "swapgs", OPLEN); -#endif - NOMEM; - break; - } - /*FALLTHROUGH*/ - - /* prefetch 
instruction - memory operand, but no memory acess */ - case PREF: - NOMEM; - /*FALLTHROUGH*/ - - /* single memory or register operand */ - case M: - wbit = LONG_OPND; - goto just_mem; - - /* single memory or register byte operand */ - case Mb: - wbit = BYTE_OPND; - goto just_mem; - - case MO: - /* Similar to M, but only memory (no direct registers) */ - wbit = LONG_OPND; - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode == 3) - goto error; - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* move special register to register or reverse if vbit */ - case SREG: - switch (opcode5) { - - case 2: - vbit = 1; - /*FALLTHROUGH*/ - case 0: - wbit = CONTROL_OPND; - break; - - case 3: - vbit = 1; - /*FALLTHROUGH*/ - case 1: - wbit = DEBUG_OPND; - break; - - case 6: - vbit = 1; - /*FALLTHROUGH*/ - case 4: - wbit = TEST_OPND; - break; - - } - dtrace_get_modrm(x, &mode, ®, &r_m); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit); - dtrace_get_operand(x, REG_ONLY, r_m, LONG_OPND, 1 - vbit); - NOMEM; - break; - - /* - * single register operand with register in the low 3 - * bits of op code - */ - case R: - if (opcode_bytes == 2) - reg = REGNO(opcode5); - else - reg = REGNO(opcode2); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0); - NOMEM; - break; - - /* - * register to accumulator with register in the low 3 - * bits of op code, xchg instructions - */ - case RA: - NOMEM; - reg = REGNO(opcode2); - dtrace_rex_adjust(rex_prefix, mode, ®, NULL); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, LONG_OPND, 1); - break; - - /* - * single segment register operand, with register in - * bits 3-4 of op code byte - */ - case SEG: - NOMEM; - reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x3; - dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0); - break; - - /* - * single segment register operand, with register in - * bits 3-5 of op code - */ - case LSEG: - NOMEM; - /* long seg reg from opcode */ - reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x7; - dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0); - break; - - /* memory or register operand to register */ - case MR: - wbit = LONG_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); - break; - - case RM: - wbit = LONG_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); - break; - - /* MMX/SIMD-Int memory or mm reg to mm reg */ - case MM: - case MMO: -#ifdef DIS_TEXT - wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; -#else - wbit = LONG_OPND; -#endif - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0); - break; - - case MMOIMPL: -#ifdef DIS_TEXT - wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; -#else - wbit = LONG_OPND; -#endif - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - dtrace_get_operand(x, REG_ONLY, reg, MM_OPND, 1); - mode = 0; /* change for memory access size... 
*/ - break; - - /* MMX/SIMD-Int and SIMD-FP predicated mm reg to r32 */ - case MMO3P: - wbit = MM_OPND; - goto xmm3p; - case XMM3P: - wbit = XMM_OPND; -xmm3p: - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 1); - NOMEM; - break; - - /* MMX/SIMD-Int predicated r32/mem to mm reg */ - case MMOPRM: - wbit = LONG_OPND; - w2 = MM_OPND; - goto xmmprm; - case XMMPRM: - wbit = LONG_OPND; - w2 = XMM_OPND; -xmmprm: - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, 1); - break; - - /* MMX/SIMD-Int predicated mm/mem to mm reg */ - case MMOPM: - wbit = w2 = MM_OPND; - goto xmmprm; - - /* MMX/SIMD-Int mm reg to r32 */ - case MMOM3: - NOMEM; - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - wbit = MM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0); - break; - - /* SIMD memory or xmm reg operand to xmm reg */ - case XMM: - case XMMO: - case XMMXIMPL: - wbit = XMM_OPND; - STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); - - if (dp->it_adrmode == XMMXIMPL && mode != REG_ONLY) - goto error; - -#ifdef DIS_TEXT - /* - * movlps and movhlps share opcodes. They differ in the - * addressing modes allowed for their operands. - * movhps and movlhps behave similarly. - */ - if (mode == REG_ONLY) { - if (strcmp(dp->it_name, "movlps") == 0) - (void) strncpy(x->d86_mneu, "movhlps", OPLEN); - else if (strcmp(dp->it_name, "movhps") == 0) - (void) strncpy(x->d86_mneu, "movlhps", OPLEN); - } -#endif - if (dp->it_adrmode == XMMXIMPL) - mode = 0; /* change for memory access size... */ - break; - - /* SIMD xmm reg to memory or xmm reg */ - case XMMS: - case XMMOS: - case XMMMS: - case XMMOMS: - dtrace_get_modrm(x, &mode, ®, &r_m); -#ifdef DIS_TEXT - if ((strcmp(dp->it_name, "movlps") == 0 || - strcmp(dp->it_name, "movhps") == 0 || - strcmp(dp->it_name, "movntps") == 0) && - mode == REG_ONLY) - goto error; -#endif - wbit = XMM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); - break; - - /* SIMD memory to xmm reg */ - case XMMM: - case XMMOM: - wbit = XMM_OPND; - dtrace_get_modrm(x, &mode, ®, &r_m); -#ifdef DIS_TEXT - if (mode == REG_ONLY) { - if (strcmp(dp->it_name, "movhps") == 0) - (void) strncpy(x->d86_mneu, "movlhps", OPLEN); - else - goto error; - } -#endif - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); - break; - - /* SIMD memory or r32 to xmm reg */ - case XMM3MX: - wbit = LONG_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); - break; - - case XMM3MXS: - wbit = LONG_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); - break; - - /* SIMD memory or mm reg to xmm reg */ - case XMMOMX: - /* SIMD mm to xmm */ - case XMMMX: - wbit = MM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); - break; - - /* SIMD memory or xmm reg to mm reg */ - case XMMXMM: - case XMMOXMM: - case XMMXM: - wbit = XMM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0); - break; - - - /* SIMD memory or xmm reg to r32 */ - case XMMXM3: - wbit = XMM_OPND; - MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0); - break; - - /* SIMD xmm to r32 */ - case XMMX3: - case XMMOX3: - dtrace_get_modrm(x, &mode, ®, &r_m); - if (mode != REG_ONLY) - goto error; - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, XMM_OPND, 0); - dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); - NOMEM; - break; - - /* SIMD predicated memory or xmm reg with/to xmm reg */ - case XMMP: - case 
XMMOPM: - wbit = XMM_OPND; - THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); - -#ifdef DIS_TEXT - /* - * cmpps and cmpss vary their instruction name based - * on the value of imm8. Other XMMP instructions, - * such as shufps, require explicit specification of - * the predicate. - */ - if (dp->it_name[0] == 'c' && - dp->it_name[1] == 'm' && - dp->it_name[2] == 'p' && - strlen(dp->it_name) == 5) { - uchar_t pred = x->d86_opnd[0].d86_value & 0xff; - - if (pred >= (sizeof (dis_PREDSUFFIX) / sizeof (char *))) - goto error; - - (void) strncpy(x->d86_mneu, "cmp", OPLEN); - (void) strlcat(x->d86_mneu, dis_PREDSUFFIX[pred], - OPLEN); - (void) strlcat(x->d86_mneu, - dp->it_name + strlen(dp->it_name) - 2, - OPLEN); - x->d86_opnd[0] = x->d86_opnd[1]; - x->d86_opnd[1] = x->d86_opnd[2]; - x->d86_numopnds = 2; - } -#endif - break; - - /* immediate operand to accumulator */ - case IA: - wbit = WBIT(opcode2); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1); - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0); - NOMEM; - break; - - /* memory or register operand to accumulator */ - case MA: - wbit = WBIT(opcode2); - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, wbit, 0); - break; - - /* si register to di register used to reference memory */ - case SD: -#ifdef DIS_TEXT - dtrace_check_override(x, 0); - x->d86_numopnds = 2; - if (addr_size == SIZE64) { - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)", - OPLEN); - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)", - OPLEN); - } else if (addr_size == SIZE32) { - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)", - OPLEN); - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)", - OPLEN); - } else { - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)", - OPLEN); - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)", - OPLEN); - } -#endif - wbit = LONG_OPND; - break; - - /* accumulator to di register */ - case AD: - wbit = WBIT(opcode2); -#ifdef DIS_TEXT - dtrace_check_override(x, 1); - x->d86_numopnds = 2; - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 0); - if (addr_size == SIZE64) - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)", - OPLEN); - else if (addr_size == SIZE32) - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)", - OPLEN); - else - (void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)", - OPLEN); -#endif - break; - - /* si register to accumulator */ - case SA: - wbit = WBIT(opcode2); -#ifdef DIS_TEXT - dtrace_check_override(x, 0); - x->d86_numopnds = 2; - if (addr_size == SIZE64) - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)", - OPLEN); - else if (addr_size == SIZE32) - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)", - OPLEN); - else - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)", - OPLEN); - dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1); -#endif - break; - - /* - * single operand, a 16/32 bit displacement - */ - case D: - wbit = LONG_OPND; - dtrace_disp_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0); - NOMEM; - break; - - /* jmp/call indirect to memory or register operand */ - case INM: -#ifdef DIS_TEXT - (void) strlcat(x->d86_opnd[0].d86_prefix, "*", OPLEN); -#endif - dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); - dtrace_get_operand(x, mode, r_m, LONG_OPND, 0); - wbit = LONG_OPND; - break; - - /* - * for long jumps and long calls -- a new code segment - * register and an offset in IP -- stored in object - * code in reverse order. 
Note - not valid in amd64 - */ - case SO: - dtrace_check_override(x, 1); - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 1); -#ifdef DIS_TEXT - x->d86_opnd[1].d86_mode = MODE_SIGNED; -#endif - /* will now get segment operand */ - dtrace_imm_opnd(x, wbit, 2, 0); - break; - - /* - * jmp/call. single operand, 8 bit displacement. - * added to current EIP in 'compofff' - */ - case BD: - dtrace_disp_opnd(x, BYTE_OPND, 1, 0); - NOMEM; - break; - - /* single 32/16 bit immediate operand */ - case I: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0); - break; - - /* single 8 bit immediate operand */ - case Ib: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, 1, 0); - break; - - case ENTER: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, 2, 0); - dtrace_imm_opnd(x, wbit, 1, 1); - switch (opnd_size) { - case SIZE64: - x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 8; - break; - case SIZE32: - x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 4; - break; - case SIZE16: - x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 2; - break; - } - - break; - - /* 16-bit immediate operand */ - case RET: - wbit = LONG_OPND; - dtrace_imm_opnd(x, wbit, 2, 0); - break; - - /* single 8 bit port operand */ - case P: - dtrace_check_override(x, 0); - dtrace_imm_opnd(x, BYTE_OPND, 1, 0); - NOMEM; - break; - - /* single operand, dx register (variable port instruction) */ - case V: - x->d86_numopnds = 1; - dtrace_check_override(x, 0); -#ifdef DIS_TEXT - (void) strlcat(x->d86_opnd[0].d86_opnd, "(%dx)", OPLEN); -#endif - NOMEM; - break; - - /* - * The int instruction, which has two forms: - * int 3 (breakpoint) or - * int n, where n is indicated in the subsequent - * byte (format Ib). The int 3 instruction (opcode 0xCC), - * where, although the 3 looks like an operand, - * it is implied by the opcode. It must be converted - * to the correct base and output. - */ - case INT3: -#ifdef DIS_TEXT - x->d86_numopnds = 1; - x->d86_opnd[0].d86_mode = MODE_SIGNED; - x->d86_opnd[0].d86_value_size = 1; - x->d86_opnd[0].d86_value = 3; -#endif - NOMEM; - break; - - /* single 8 bit immediate operand */ - case INTx: - dtrace_imm_opnd(x, BYTE_OPND, 1, 0); - NOMEM; - break; - - /* an unused byte must be discarded */ - case U: - if (x->d86_get_byte(x->d86_data) < 0) - goto error; - x->d86_len++; - NOMEM; - break; - - case CBW: -#ifdef DIS_TEXT - if (opnd_size == SIZE16) - (void) strlcat(x->d86_mneu, "cbtw", OPLEN); - else if (opnd_size == SIZE32) - (void) strlcat(x->d86_mneu, "cwtl", OPLEN); - else - (void) strlcat(x->d86_mneu, "cltq", OPLEN); -#endif - wbit = LONG_OPND; - NOMEM; - break; - - case CWD: -#ifdef DIS_TEXT - if (opnd_size == SIZE16) - (void) strlcat(x->d86_mneu, "cwtd", OPLEN); - else if (opnd_size == SIZE32) - (void) strlcat(x->d86_mneu, "cltd", OPLEN); - else - (void) strlcat(x->d86_mneu, "cqtd", OPLEN); -#endif - wbit = LONG_OPND; - NOMEM; - break; - - case XMMSFNC: - /* - * sfence is sfence if mode is REG_ONLY. If mode isn't - * REG_ONLY, mnemonic should be 'clflush'. 
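Displacement operands such as BD are marked MODE_IPREL by dtrace_disp_opnd(), i.e. they are relative to the end of the instruction, so a consumer resolves them as pc + instruction length + sign-extended displacement. A sketch of that resolution (function name and example encodings invented):

/*
 * Standalone illustration: resolve an ip-relative displacement to an
 * absolute target address.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t
iprel_target(uint64_t pc, unsigned insn_len, int32_t disp)
{
	return pc + insn_len + (int64_t)disp;
}

int
main(void)
{
	/* "74 fe" at 0x1000 is a 2-byte je with disp8 -2: jumps to itself */
	printf("%#llx\n",
	    (unsigned long long)iprel_target(0x1000, 2, -2));	/* 0x1000 */
	/* "eb 10" at 0x2000: jmp 0x10 bytes past the instruction */
	printf("%#llx\n",
	    (unsigned long long)iprel_target(0x2000, 2, 0x10));	/* 0x2012 */
	return 0;
}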
- */ - dtrace_get_modrm(x, &mode, ®, &r_m); - - /* sfence doesn't take operands */ -#ifdef DIS_TEXT - if (mode == REG_ONLY) { - (void) strlcat(x->d86_mneu, "sfence", OPLEN); - } else { - (void) strlcat(x->d86_mneu, "clflush", OPLEN); - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0); - NOMEM; - } -#else - if (mode != REG_ONLY) { - dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); - dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0); - NOMEM; - } -#endif - break; - - /* - * no disassembly, the mnemonic was all there was so go on - */ - case NORM: - if (dp->it_invalid32 && cpu_mode != SIZE64) - goto error; - NOMEM; - /*FALLTHROUGH*/ - case IMPLMEM: - break; - - case XMMFENCE: - /* - * Only the following exact byte sequences are allowed: - * - * 0f ae e8 lfence - * 0f ae f0 mfence - */ - if ((uint8_t)x->d86_bytes[x->d86_len - 1] != 0xe8 && - (uint8_t)x->d86_bytes[x->d86_len - 1] != 0xf0) - goto error; - - break; - - - /* float reg */ - case F: -#ifdef DIS_TEXT - x->d86_numopnds = 1; - (void) strlcat(x->d86_opnd[0].d86_opnd, "%st(X)", OPLEN); - x->d86_opnd[0].d86_opnd[4] = r_m + '0'; -#endif - NOMEM; - break; - - /* float reg to float reg, with ret bit present */ - case FF: - vbit = opcode2 >> 2 & 0x1; /* vbit = 1: st -> st(i) */ - /*FALLTHROUGH*/ - case FFC: /* case for vbit always = 0 */ -#ifdef DIS_TEXT - x->d86_numopnds = 2; - (void) strlcat(x->d86_opnd[1 - vbit].d86_opnd, "%st", OPLEN); - (void) strlcat(x->d86_opnd[vbit].d86_opnd, "%st(X)", OPLEN); - x->d86_opnd[vbit].d86_opnd[4] = r_m + '0'; -#endif - NOMEM; - break; - - /* an invalid op code */ - case AM: - case DM: - case OVERRIDE: - case PREFIX: - case UNKNOWN: - NOMEM; - default: - goto error; - } /* end switch */ - if (x->d86_error) - goto error; - -done: -#ifdef DIS_MEM - /* - * compute the size of any memory accessed by the instruction - */ - if (x->d86_memsize != 0) { - return (0); - } else if (dp->it_stackop) { - switch (opnd_size) { - case SIZE16: - x->d86_memsize = 2; - break; - case SIZE32: - x->d86_memsize = 4; - break; - case SIZE64: - x->d86_memsize = 8; - break; - } - } else if (nomem || mode == REG_ONLY) { - x->d86_memsize = 0; - - } else if (dp->it_size != 0) { - /* - * In 64 bit mode descriptor table entries - * go up to 10 bytes and popf/pushf are always 8 bytes - */ - if (x->d86_mode == SIZE64 && dp->it_size == 6) - x->d86_memsize = 10; - else if (x->d86_mode == SIZE64 && opcode1 == 0x9 && - (opcode2 == 0xc || opcode2 == 0xd)) - x->d86_memsize = 8; - else - x->d86_memsize = dp->it_size; - - } else if (wbit == 0) { - x->d86_memsize = 1; - - } else if (wbit == LONG_OPND) { - if (opnd_size == SIZE64) - x->d86_memsize = 8; - else if (opnd_size == SIZE32) - x->d86_memsize = 4; - else - x->d86_memsize = 2; - - } else if (wbit == SEG_OPND) { - x->d86_memsize = 4; - - } else { - x->d86_memsize = 8; - } -#endif - return (0); - -error: -#ifdef DIS_TEXT - (void) strlcat(x->d86_mneu, "undef", OPLEN); -#endif - return (1); -} - -#ifdef DIS_TEXT - -/* - * Some instructions should have immediate operands printed - * as unsigned integers. We compare against this table. - */ -static char *unsigned_ops[] = { - "or", "and", "xor", "test", "in", "out", "lcall", "ljmp", - "rcr", "rcl", "ror", "rol", "shl", "shr", "sal", "psr", "psl", - 0 -}; - -static int -isunsigned_op(char *opcode) -{ - char *where; - int i; - int is_unsigned = 0; - - /* - * Work back to start of last mnemonic, since we may have - * prefixes on some opcodes. 
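The prefix-skipping walk described here simply backs up from the end of the mnemonic to the last blank, so "lock xadd" or "repz cmpsb" are classified by their final word. Reduced to a standalone helper with an invented name:

/*
 * Standalone illustration: find the start of the last mnemonic word,
 * skipping any prefix words such as "lock " or "repz ".
 */
#include <stdio.h>
#include <string.h>

static const char *
last_mnemonic(const char *mneu)
{
	const char *p = mneu + strlen(mneu);

	while (p > mneu && p[-1] != ' ')	/* back up to the last blank */
		p--;
	return p;
}

int
main(void)
{
	printf("%s\n", last_mnemonic("lock xadd"));	/* xadd */
	printf("%s\n", last_mnemonic("repz cmpsb"));	/* cmpsb */
	printf("%s\n", last_mnemonic("andl"));		/* andl */
	return 0;
}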
- */ - where = opcode + strlen(opcode) - 1; - while (where > opcode && *where != ' ') - --where; - if (*where == ' ') - ++where; - - for (i = 0; unsigned_ops[i]; ++i) { - if (strncmp(where, unsigned_ops[i], - strlen(unsigned_ops[i]))) - continue; - is_unsigned = 1; - break; - } - return (is_unsigned); -} - -/* ARGSUSED */ -void -dtrace_disx86_str(dis86_t *dis, uint_t mode, uintptr_t pc, char *buf, - size_t buflen) -{ - int i; - - dis->d86_sprintf_func(buf, buflen, "%-6s ", dis->d86_mneu); - - /* - * For PC-relative jumps, the pc is really the next pc after executing - * this instruction, so increment it appropriately. - */ - pc += dis->d86_len; - - for (i = 0; i < dis->d86_numopnds; i++) { - d86opnd_t *op = &dis->d86_opnd[i]; - int64_t sv; - uint64_t mask; - - if (i != 0) - (void) strlcat(buf, ",", buflen); - - (void) strlcat(buf, op->d86_prefix, buflen); - - sv = op->d86_value; - - switch (op->d86_mode) { - - case MODE_NONE: - - (void) strlcat(buf, op->d86_opnd, buflen); - break; - - case MODE_SIGNED: - case MODE_IMPLIED: - case MODE_OFFSET: - - if (dis->d86_seg_prefix) - (void) strlcat(buf, dis->d86_seg_prefix, - buflen); - - switch (op->d86_value_size) { - case 1: - sv = (int8_t)sv; - mask = 0xff; - break; - case 2: - sv = (int16_t)sv; - mask = 0xffff; - break; - case 4: - sv = (int32_t)sv; - mask = 0xffffffff; - break; - case 8: - mask = 0xffffffffffffffffULL; - break; - } - - if (op->d86_mode == MODE_SIGNED || - op->d86_mode == MODE_IMPLIED) - (void) strlcat(buf, "$", buflen); - - if (sv < 0 && sv > -0xffff && - !isunsigned_op(dis->d86_mneu)) { - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "-0%llo" : "-0x%llx", -sv & mask); - } else { - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "0%llo" : "0x%llx", sv & mask); - } - (void) strlcat(buf, op->d86_opnd, buflen); - break; - - case MODE_IPREL: - - switch (op->d86_value_size) { - case 1: - sv = (int8_t)sv; - break; - case 2: - sv = (int16_t)sv; - break; - case 4: - sv = (int32_t)sv; - break; - } - - if (sv < 0) - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "-0%llo" : "-0x%llx", -sv - dis->d86_len); - else - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "+0%llo" : "+0x%llx", sv + dis->d86_len); - - (void) strlcat(buf, "\t<", buflen); - - if (dis->d86_sym_lookup == NULL || - dis->d86_sym_lookup(dis->d86_data, pc + sv, - buf + strlen(buf), buflen - strlen(buf)) != 0) - dis->d86_sprintf_func(buf + strlen(buf), - buflen - strlen(buf), - (dis->d86_flags & DIS_OP_OCTAL) ? - "0%llo" : "0x%llx", pc + sv); - - (void) strlcat(buf, ">", buflen); - - break; - } - } -} - -#endif /* DIS_TEXT */ Index: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h diff -N src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h --- src/external/cddl/osnet/dev/dtrace/i386/dis_tables.h 21 Feb 2010 01:46:33 -0000 1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,114 +0,0 @@ -/* $NetBSD: dis_tables.h,v 1.2 2010/02/21 01:46:33 darran Exp $ */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dis_tables.h,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - - -#ifndef _DIS_TABLES_H -#define _DIS_TABLES_H - -#if defined(sun) -#pragma ident "@(#)dis_tables.h 1.7 06/03/02 SMI" -#endif - -/* - * Constants and prototypes for the IA32 disassembler backend. See dis_tables.c - * for usage information and documentation. - */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include - -/* - * values for cpu mode - */ -#define SIZE16 1 -#define SIZE32 2 -#define SIZE64 3 - -#define OPLEN 256 -#define PFIXLEN 8 -#define NCPS 12 /* number of chars per symbol */ - -/* - * data structures that must be provided to dtrace_dis86() - */ -typedef struct d86opnd { - char d86_opnd[OPLEN]; /* symbolic rep of operand */ - char d86_prefix[PFIXLEN]; /* any prefix string or "" */ - uint_t d86_mode; /* mode for immediate */ - uint_t d86_value_size; /* size in bytes of d86_value */ - uint64_t d86_value; /* immediate value of opnd */ -} d86opnd_t; - -typedef struct dis86 { - uint_t d86_mode; - uint_t d86_error; - uint_t d86_len; /* instruction length */ - int d86_rmindex; /* index of modrm byte or -1 */ - uint_t d86_memsize; /* size of memory referenced */ - char d86_bytes[16]; /* bytes of instruction */ - char d86_mneu[OPLEN]; - uint_t d86_numopnds; - uint_t d86_rex_prefix; /* value of REX prefix if !0 */ - char *d86_seg_prefix; /* segment prefix, if any */ - uint_t d86_opnd_size; - uint_t d86_addr_size; - uint_t d86_got_modrm; - struct d86opnd d86_opnd[3]; /* up to 3 operands */ - int (*d86_check_func)(void *); - int (*d86_get_byte)(void *); -#ifdef DIS_TEXT - int (*d86_sym_lookup)(void *, uint64_t, char *, size_t); - int (*d86_sprintf_func)(char *, size_t, const char *, ...); - int d86_flags; - uint_t d86_imm_bytes; -#endif - void *d86_data; -} dis86_t; - -extern int dtrace_disx86(dis86_t *x, uint_t cpu_mode); - -#define DIS_OP_OCTAL 0x1 /* Print all numbers in octal */ - -#ifdef DIS_TEXT -extern void dtrace_disx86_str(dis86_t *x, uint_t cpu_mode, uintptr_t pc, - char *buf, size_t len); -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _DIS_TABLES_H */ Index: src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S,v retrieving revision 1.6 diff -u -p -r1.6 dtrace_asm.S --- src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S 27 Feb 2017 06:47:00 -0000 1.6 +++ src/external/cddl/osnet/dev/dtrace/i386/dtrace_asm.S 12 Apr 2017 16:19:26 -0000 @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dtrace_asm.S,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/i386/dtrace_asm.S 298171 
2016-04-17 23:08:47Z markj $ */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. @@ -146,11 +146,11 @@ invop_leave: movl 8(%esp), %eax /* load calling EIP */ incl %eax /* increment over LOCK prefix */ movl %eax, -8(%ebx) /* store calling EIP */ - movl %ebx, -4(%esp) /* temporarily store new %esp */ + subl $8, %ebx /* adjust for three pushes, one pop */ + movl %ebx, 8(%esp) /* temporarily store new %esp */ popl %ebx /* pop off temp */ popl %eax /* pop off temp */ - movl -12(%esp), %esp /* set stack pointer */ - subl $8, %esp /* adjust for three pushes, one pop */ + movl (%esp), %esp /* set stack pointer */ iret /* return from interrupt */ invop_nop: /* @@ -193,19 +193,7 @@ uint32_t dtrace_cas32(uint32_t *target, */ ENTRY(dtrace_cas32) - movl 4(%esp), %edx - movl 8(%esp), %eax - movl 12(%esp), %ecx - lock - cmpxchgl %ecx, (%edx) - ret - END(dtrace_cas32) - -/* -uint32_t dtrace_casptr(uint32_t *target, uint32_t cmp, uint32_t new) -*/ - - ENTRY(dtrace_casptr) + ALTENTRY(dtrace_casptr) movl 4(%esp), %edx movl 8(%esp), %eax movl 12(%esp), %ecx @@ -213,7 +201,7 @@ uint32_t dtrace_casptr(uint32_t *target, cmpxchgl %ecx, (%edx) ret END(dtrace_casptr) - + END(dtrace_cas32) /* uintptr_t dtrace_caller(int aframes) Index: src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c,v retrieving revision 1.5 diff -u -p -r1.5 dtrace_isa.c --- src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c 27 Feb 2017 06:47:00 -0000 1.5 +++ src/external/cddl/osnet/dev/dtrace/i386/dtrace_isa.c 8 May 2017 08:27:51 -0000 @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dtrace_isa.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/i386/dtrace_isa.c 298171 2016-04-17 23:08:47Z markj $ */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. @@ -36,6 +36,8 @@ #include #include +#include "regset.h" + uintptr_t kernelbase = (uintptr_t)KERNBASE; #define INKERNEL(va) \ @@ -54,6 +56,8 @@ uint16_t dtrace_fuword16_nocheck(void *) uint32_t dtrace_fuword32_nocheck(void *); uint64_t dtrace_fuword64_nocheck(void *); +int dtrace_ustackdepth_max = 2048; + void dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes, uint32_t *intrpc) @@ -112,11 +116,13 @@ dtrace_getustack_common(uint64_t *pcstac uintptr_t oldcontext = lwp->lwp_oldcontext; /* XXX signal stack. */ size_t s1, s2; #endif + uintptr_t oldsp; volatile uint16_t *flags = (volatile uint16_t *)&cpu_core[cpu_number()].cpuc_dtrace_flags; int ret = 0; ASSERT(pcstack == NULL || pcstack_limit > 0); + ASSERT(dtrace_ustackdepth_max > 0); #ifdef notyet /* XXX signal stack. */ if (p->p_model == DATAMODEL_NATIVE) { @@ -129,7 +135,16 @@ dtrace_getustack_common(uint64_t *pcstac #endif while (pc != 0) { - ret++; + /* + * We limit the number of times we can go around this + * loop to account for a circular stack. + */ + if (ret++ >= dtrace_ustackdepth_max) { + *flags |= CPU_DTRACE_BADSTACK; + cpu_core[cpu_number()].cpuc_dtrace_illval = sp; + break; + } + if (pcstack != NULL) { *pcstack++ = (uint64_t)pc; pcstack_limit--; @@ -140,6 +155,8 @@ dtrace_getustack_common(uint64_t *pcstac if (sp == 0) break; + oldsp = sp; + #ifdef notyet /* XXX signal stack. */ if (oldcontext == sp + s1 || oldcontext == sp + s2) { if (p->p_model == DATAMODEL_NATIVE) { @@ -178,6 +195,12 @@ dtrace_getustack_common(uint64_t *pcstac sp = dtrace_fuword32((void *)sp); #endif /* ! 
notyet */ + if (sp == oldsp) { + *flags |= CPU_DTRACE_BADSTACK; + cpu_core[cpu_number()].cpuc_dtrace_illval = sp; + break; + } + /* * This is totally bogus: if we faulted, we're going to clear * the fault and break. This is to deal with the apparently @@ -242,7 +265,7 @@ dtrace_getupcstack(uint64_t *pcstack, in pc = dtrace_fuword32((void *) sp); } - n = dtrace_getustack_common(pcstack, pcstack_limit, pc, fp); + n = dtrace_getustack_common(pcstack, pcstack_limit, pc, sp); ASSERT(n >= 0); ASSERT(n <= pcstack_limit); @@ -429,6 +452,7 @@ dtrace_getarg(int arg, int aframes) stack = (uintptr_t *)&frame->tf_esp + 1; goto load; } + } /* @@ -480,112 +504,102 @@ dtrace_getstackdepth(int aframes) return depth - aframes; } -#ifdef notyet ulong_t -dtrace_getreg(struct regs *rp, uint_t reg) +dtrace_getreg(struct trapframe *rp, uint_t reg) { -#if defined(__amd64) - int regmap[] = { - REG_GS, /* GS */ - REG_FS, /* FS */ - REG_ES, /* ES */ - REG_DS, /* DS */ - REG_RDI, /* EDI */ - REG_RSI, /* ESI */ - REG_RBP, /* EBP */ - REG_RSP, /* ESP */ - REG_RBX, /* EBX */ - REG_RDX, /* EDX */ - REG_RCX, /* ECX */ - REG_RAX, /* EAX */ - REG_TRAPNO, /* TRAPNO */ - REG_ERR, /* ERR */ - REG_RIP, /* EIP */ - REG_CS, /* CS */ - REG_RFL, /* EFL */ - REG_RSP, /* UESP */ - REG_SS /* SS */ + struct pcb *pcb; + int regmap[] = { /* Order is dependent on reg.d */ + REG_GS, /* 0 GS */ + REG_FS, /* 1 FS */ + REG_ES, /* 2 ES */ + REG_DS, /* 3 DS */ + REG_RDI, /* 4 EDI */ + REG_RSI, /* 5 ESI */ + REG_RBP, /* 6 EBP, REG_FP */ + REG_RSP, /* 7 ESP */ + REG_RBX, /* 8 EBX */ + REG_RDX, /* 9 EDX, REG_R1 */ + REG_RCX, /* 10 ECX */ + REG_RAX, /* 11 EAX, REG_R0 */ + REG_TRAPNO, /* 12 TRAPNO */ + REG_ERR, /* 13 ERR */ + REG_RIP, /* 14 EIP, REG_PC */ + REG_CS, /* 15 CS */ + REG_RFL, /* 16 EFL, REG_PS */ + REG_RSP, /* 17 UESP, REG_SP */ + REG_SS /* 18 SS */ }; - if (reg <= SS) { - if (reg >= sizeof (regmap) / sizeof (int)) { - DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); - return (0); - } + if (reg > SS) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } - reg = regmap[reg]; - } else { - reg -= SS + 1; + if (reg >= sizeof (regmap) / sizeof (int)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); } - switch (reg) { + reg = regmap[reg]; + + switch(reg) { + case REG_GS: +#ifdef __FreeBSD__ + if ((pcb = curthread->td_pcb) == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } + return (pcb->pcb_gs); +#endif +#ifdef __NetBSD__ + return (rp->tf_gs); +#endif + case REG_FS: + return (rp->tf_fs); + case REG_ES: + return (rp->tf_es); + case REG_DS: + return (rp->tf_ds); case REG_RDI: - return (rp->r_rdi); + return (rp->tf_edi); case REG_RSI: - return (rp->r_rsi); - case REG_RDX: - return (rp->r_rdx); + return (rp->tf_esi); + case REG_RBP: + return (rp->tf_ebp); + case REG_RSP: +#ifdef __FreeBSD__ + return (rp->tf_isp); +#endif +#ifdef __NetBSD__ + return (rp->tf_esp); +#endif + case REG_RBX: + return (rp->tf_ebx); case REG_RCX: - return (rp->r_rcx); - case REG_R8: - return (rp->r_r8); - case REG_R9: - return (rp->r_r9); + return (rp->tf_ecx); case REG_RAX: - return (rp->r_rax); - case REG_RBX: - return (rp->r_rbx); - case REG_RBP: - return (rp->r_rbp); - case REG_R10: - return (rp->r_r10); - case REG_R11: - return (rp->r_r11); - case REG_R12: - return (rp->r_r12); - case REG_R13: - return (rp->r_r13); - case REG_R14: - return (rp->r_r14); - case REG_R15: - return (rp->r_r15); - case REG_DS: - return (rp->r_ds); - case REG_ES: - return (rp->r_es); - case REG_FS: - return (rp->r_fs); - case REG_GS: - return (rp->r_gs); + return 
(rp->tf_eax); case REG_TRAPNO: - return (rp->r_trapno); + return (rp->tf_trapno); case REG_ERR: - return (rp->r_err); + return (rp->tf_err); case REG_RIP: - return (rp->r_rip); + return (rp->tf_eip); case REG_CS: - return (rp->r_cs); - case REG_SS: - return (rp->r_ss); + return (rp->tf_cs); case REG_RFL: - return (rp->r_rfl); + return (rp->tf_eflags); +#if 0 case REG_RSP: - return (rp->r_rsp); + return (rp->tf_esp); +#endif + case REG_SS: + return (rp->tf_ss); default: DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); return (0); } - -#else - if (reg > SS) { - DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); - return (0); - } - - return ((&rp->r_gs)[reg]); -#endif } -#endif static int dtrace_copycheck(uintptr_t uaddr, uintptr_t kaddr, size_t size) Index: src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c,v retrieving revision 1.8 diff -u -p -r1.8 dtrace_subr.c --- src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c 27 Feb 2017 06:47:00 -0000 1.8 +++ src/external/cddl/osnet/dev/dtrace/i386/dtrace_subr.c 19 Apr 2017 17:15:40 -0000 @@ -21,7 +21,7 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/dev/dtrace/i386/dtrace_subr.c,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/dtrace/i386/dtrace_subr.c 313850 2017-02-17 03:27:20Z markj $ * */ /* @@ -29,6 +29,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ + #include #include #include @@ -38,7 +42,6 @@ #include #include #include -//#include #include #include #include @@ -51,8 +54,8 @@ #include extern uintptr_t kernelbase; -extern uintptr_t dtrace_in_probe_addr; -extern int dtrace_in_probe; + +extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); @@ -162,122 +165,6 @@ dtrace_sync(void) } #ifdef notyet -int (*dtrace_fasttrap_probe_ptr)(struct regs *); -int (*dtrace_pid_probe_ptr)(struct regs *); -int (*dtrace_return_probe_ptr)(struct regs *); - -void -dtrace_user_probe(struct regs *rp, caddr_t addr, processorid_t cpuid) -{ - krwlock_t *rwp; - proc_t *p = curproc; - extern void trap(struct regs *, caddr_t, processorid_t); - - if (USERMODE(rp->r_cs) || (rp->r_ps & PS_VM)) { - if (curthread->t_cred != p->p_cred) { - cred_t *oldcred = curthread->t_cred; - /* - * DTrace accesses t_cred in probe context. t_cred - * must always be either NULL, or point to a valid, - * allocated cred structure. - */ - curthread->t_cred = crgetcred(); - crfree(oldcred); - } - } - - if (rp->r_trapno == T_DTRACE_RET) { - uint8_t step = curthread->t_dtrace_step; - uint8_t ret = curthread->t_dtrace_ret; - uintptr_t npc = curthread->t_dtrace_npc; - - if (curthread->t_dtrace_ast) { - aston(curthread); - curthread->t_sig_check = 1; - } - - /* - * Clear all user tracing flags. - */ - curthread->t_dtrace_ft = 0; - - /* - * If we weren't expecting to take a return probe trap, kill - * the process as though it had just executed an unassigned - * trap instruction. - */ - if (step == 0) { - tsignal(curthread, SIGILL); - return; - } - - /* - * If we hit this trap unrelated to a return probe, we're - * just here to reset the AST flag since we deferred a signal - * until after we logically single-stepped the instruction we - * copied out. - */ - if (ret == 0) { - rp->r_pc = npc; - return; - } - - /* - * We need to wait until after we've called the - * dtrace_return_probe_ptr function pointer to set %pc. 
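
[Editor's note, not part of the patch] The two guards added to dtrace_getustack_common() further up — the dtrace_ustackdepth_max iteration cap and the bail-out when the saved frame pointer fails to advance — exist because the walker chases user-supplied frame pointers and must terminate even on a corrupt or circular stack. A minimal standalone sketch of the same idea, with hypothetical types and names; the real code reads each frame with dtrace_fuword32() and raises CPU_DTRACE_BADSTACK instead of merely stopping:

#include <stdint.h>
#include <stdio.h>

struct fake_frame {			/* stand-in for a user i386 frame */
	uintptr_t fr_savfp;		/* saved %ebp (next frame) */
	uintptr_t fr_savpc;		/* return address */
};

/* Walk at most 'limit' frames; stop if the chain stalls or loops. */
static int
walk_frames(const struct fake_frame *fp, uintptr_t *pcs, int limit)
{
	int n = 0;

	while (fp != NULL && n < limit) {
		const struct fake_frame *old = fp;

		pcs[n++] = fp->fr_savpc;
		fp = (const struct fake_frame *)fp->fr_savfp;
		if (fp == old)		/* frame pointer did not advance */
			break;
	}
	return n;
}

int
main(void)
{
	struct fake_frame leaf = { 0, 0x3000 };
	struct fake_frame root = { (uintptr_t)&leaf, 0x2000 };
	uintptr_t pcs[8];
	int n;

	leaf.fr_savfp = (uintptr_t)&leaf;	/* deliberately circular */
	n = walk_frames(&root, pcs, 8);
	printf("captured %d frames before bailing out\n", n);
	return 0;
}
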
- */ - rwp = &CPU->cpu_ft_lock; - rw_enter(rwp, RW_READER); - if (dtrace_return_probe_ptr != NULL) - (void) (*dtrace_return_probe_ptr)(rp); - rw_exit(rwp); - rp->r_pc = npc; - - } else if (rp->r_trapno == T_DTRACE_PROBE) { - rwp = &CPU->cpu_ft_lock; - rw_enter(rwp, RW_READER); - if (dtrace_fasttrap_probe_ptr != NULL) - (void) (*dtrace_fasttrap_probe_ptr)(rp); - rw_exit(rwp); - - } else if (rp->r_trapno == T_BPTFLT) { - uint8_t instr; - rwp = &CPU->cpu_ft_lock; - - /* - * The DTrace fasttrap provider uses the breakpoint trap - * (int 3). We let DTrace take the first crack at handling - * this trap; if it's not a probe that DTrace knowns about, - * we call into the trap() routine to handle it like a - * breakpoint placed by a conventional debugger. - */ - rw_enter(rwp, RW_READER); - if (dtrace_pid_probe_ptr != NULL && - (*dtrace_pid_probe_ptr)(rp) == 0) { - rw_exit(rwp); - return; - } - rw_exit(rwp); - - /* - * If the instruction that caused the breakpoint trap doesn't - * look like an int 3 anymore, it may be that this tracepoint - * was removed just after the user thread executed it. In - * that case, return to user land to retry the instuction. - */ - if (fuword8((void *)(rp->r_pc - 1), &instr) == 0 && - instr != FASTTRAP_INSTR) { - rp->r_pc--; - return; - } - - trap(rp, addr, cpuid); - - } else { - trap(rp, addr, cpuid); - } -} - void dtrace_safe_synchronous_signal(void) { @@ -323,14 +210,15 @@ dtrace_safe_defer_signal(void) } /* - * If we've executed the original instruction, but haven't performed - * the jmp back to t->t_dtrace_npc or the clean up of any registers - * used to emulate %rip-relative instructions in 64-bit mode, do that - * here and take the signal right away. We detect this condition by - * seeing if the program counter is the range [scrpc + isz, astpc). + * If we have executed the original instruction, but we have performed + * neither the jmp back to t->t_dtrace_npc nor the clean up of any + * registers used to emulate %rip-relative instructions in 64-bit mode, + * we'll save ourselves some effort by doing that here and taking the + * signal right away. We detect this condition by seeing if the program + * counter is the range [scrpc + isz, astpc). */ - if (t->t_dtrace_astpc - rp->r_pc < - t->t_dtrace_astpc - t->t_dtrace_scrpc - isz) { + if (rp->r_pc >= t->t_dtrace_scrpc + isz && + rp->r_pc < t->t_dtrace_astpc) { #ifdef __amd64 /* * If there is a scratch register and we're on the @@ -376,10 +264,8 @@ dtrace_safe_defer_signal(void) } #endif -#if 0 static int64_t tgt_cpu_tsc; static int64_t hst_cpu_tsc; -#endif static int64_t tsc_skew[MAXCPUS]; static uint64_t nsec_scale; @@ -395,29 +281,6 @@ dtrace_rdtsc(void) return (rv); } -#if 0 -static void -dtrace_gethrtime_init_sync(void *arg) -{ -#ifdef CHECK_SYNC - /* - * Delay this function from returning on one - * of the CPUs to check that the synchronisation - * works. - */ - uintptr_t cpu = (uintptr_t) arg; - - if (cpu == curcpu) { - int i; - for (i = 0; i < 1000000000; i++) - tgt_cpu_tsc = dtrace_rdtsc(); - tgt_cpu_tsc = 0; - } -#endif -} -#endif - -#if 0 static void dtrace_gethrtime_init_cpu(void *arg) { @@ -428,7 +291,6 @@ dtrace_gethrtime_init_cpu(void *arg) else hst_cpu_tsc = dtrace_rdtsc(); } -#endif void dtrace_gethrtime_init(void *arg) @@ -452,8 +314,8 @@ dtrace_gethrtime_init(void *arg) * another 32-bit integer without overflowing 64-bit. * Thus minimum supported TSC frequency is 62.5MHz. 
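
[Editor's note, not part of the patch] The comment above explains why the new KASSERTMSG insists on a TSC of at least 62.5 MHz: nsec_scale is the NANOSEC/tsc_f ratio scaled up by SCALE_SHIFT bits, and it must still fit in 32 bits so that multiplying it by a 32-bit quantity cannot overflow 64 bits. A worked sketch under the assumption that SCALE_SHIFT is 28 (the value the 62.5 MHz figure implies); the hi/lo split shows how such a scale is typically applied to a 64-bit TSC delta and is illustrative, not a copy of dtrace_gethrtime():

#include <stdint.h>
#include <stdio.h>

#define NANOSEC		1000000000ULL
#define SCALE_SHIFT	28	/* assumed; NANOSEC >> (32 - 28) == 62.5e6 */

int
main(void)
{
	uint64_t tsc_f = 2400000000ULL;		/* e.g. a 2.4 GHz TSC */
	uint64_t nsec_scale = (NANOSEC << SCALE_SHIFT) / tsc_f;
	uint64_t delta = 4800000ULL;		/* cycles elapsed */
	uint64_t lo = (uint32_t)delta;
	uint64_t hi = delta >> 32;
	uint64_t ns;

	/*
	 * Because tsc_f > NANOSEC >> (32 - SCALE_SHIFT), nsec_scale fits
	 * in 32 bits and each partial product stays within 64 bits.
	 */
	ns = ((lo * nsec_scale) >> SCALE_SHIFT) +
	    ((hi * nsec_scale) << (32 - SCALE_SHIFT));

	printf("%llu cycles -> %llu ns\n",
	    (unsigned long long)delta, (unsigned long long)ns);
	return 0;
}
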
*/ - //KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), ("TSC frequency is too low")); - KASSERT(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT))); + KASSERTMSG(tsc_f > (NANOSEC >> (32 - SCALE_SHIFT)), + "TSC frequency is too low"); /* * We scale up NANOSEC/tsc_f ratio to preserve as much precision @@ -476,26 +338,37 @@ dtrace_gethrtime_init(void *arg) /* Already handled in x86/tsc.c for ci_data.cpu_cc_skew */ #if 0 - for (i = 0; i <= mp_maxid; i++) { + /* The current CPU is the reference one. */ + sched_pin(); + tsc_skew[curcpu] = 0; + CPU_FOREACH(i) { if (i == curcpu) continue; - if (pcpu_find(i) == NULL) - continue; - - map = 0; - map |= (1 << curcpu); - map |= (1 << i); + pc = pcpu_find(i); + CPU_SETOF(PCPU_GET(cpuid), &map); + CPU_SET(pc->pc_cpuid, &map); - smp_rendezvous_cpus(map, dtrace_gethrtime_init_sync, + smp_rendezvous_cpus(map, NULL, dtrace_gethrtime_init_cpu, smp_no_rendevous_barrier, (void *)(uintptr_t) i); tsc_skew[i] = tgt_cpu_tsc - hst_cpu_tsc; } + sched_unpin(); #endif } +#ifdef __FreeBSD__ +#ifdef EARLY_AP_STARTUP +SYSINIT(dtrace_gethrtime_init, SI_SUB_DTRACE, SI_ORDER_ANY, + dtrace_gethrtime_init, NULL); +#else +SYSINIT(dtrace_gethrtime_init, SI_SUB_SMP, SI_ORDER_ANY, dtrace_gethrtime_init, + NULL); +#endif +#endif + /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have @@ -526,27 +399,33 @@ dtrace_gethrtime() uint64_t dtrace_gethrestime(void) { - printf("%s(%d): XXX\n",__func__,__LINE__); - return (0); + struct timespec current_time; + + dtrace_getnanotime(¤t_time); + + return (current_time.tv_sec * 1000000000ULL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See i386/i386/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { + bool nofault; cpuid_t cpuid = cpu_number(); /* current cpu id */ /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets - * a flag in it's per-cpu flags to indicate that it doesn't - * want to fault. On returning from the the probe, the no-fault + * a flag in its per-cpu flags to indicate that it doesn't + * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: - * */ - if ((cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { + nofault = (cpu_core[cpuid].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0; + if (nofault) { + KASSERTMSG((read_eflags() & PSL_I) == 0, "interrupts enabled"); + /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. Index: src/external/cddl/osnet/dev/dtrace/i386/instr_size.c =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/i386/instr_size.c diff -N src/external/cddl/osnet/dev/dtrace/i386/instr_size.c --- src/external/cddl/osnet/dev/dtrace/i386/instr_size.c 21 Feb 2010 01:46:33 -0000 1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,134 +0,0 @@ -/* $NetBSD: instr_size.c,v 1.2 2010/02/21 01:46:33 darran Exp $ */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - * - * $FreeBSD: src/sys/cddl/dev/dtrace/i386/instr_size.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - - -#if defined(sun) -#pragma ident "@(#)instr_size.c 1.14 05/07/08 SMI" -#endif - -#include -#include -#include -#if defined(sun) -#include -#include -#include -#include -#include -#else -typedef u_int model_t; -#define DATAMODEL_NATIVE 0 -int dtrace_instr_size(uchar_t *); -#endif - -#include - -/* - * This subsystem (with the minor exception of the instr_size() function) is - * is called from DTrace probe context. This imposes several requirements on - * the implementation: - * - * 1. External subsystems and functions may not be referenced. The one current - * exception is for cmn_err, but only to signal the detection of table - * errors. Assuming the tables are correct, no combination of input is to - * trigger a cmn_err call. - * - * 2. These functions can't be allowed to be traced. To prevent this, - * all functions in the probe path (everything except instr_size()) must - * have names that begin with "dtrace_". - */ - -typedef enum dis_isize { - DIS_ISIZE_INSTR, - DIS_ISIZE_OPERAND -} dis_isize_t; - - -/* - * get a byte from instruction stream - */ -static int -dtrace_dis_get_byte(void *p) -{ - int ret; - uchar_t **instr = p; - - ret = **instr; - *instr += 1; - - return (ret); -} - -/* - * Returns either the size of a given instruction, in bytes, or the size of that - * instruction's memory access (if any), depending on the value of `which'. - * If a programming error in the tables is detected, the system will panic to - * ease diagnosis. Invalid instructions will not be flagged. They will appear - * to have an instruction size between 1 and the actual size, and will be - * reported as having no memory impact. - */ -/* ARGSUSED2 */ -static int -dtrace_dis_isize(uchar_t *instr, dis_isize_t which, model_t model, int *rmindex) -{ - int sz; - dis86_t x; - uint_t mode = SIZE32; - -#if defined(sun) - mode = (model == DATAMODEL_LP64) ? 
SIZE64 : SIZE32; -#endif - - x.d86_data = (void **)&instr; - x.d86_get_byte = dtrace_dis_get_byte; - x.d86_check_func = NULL; - - if (dtrace_disx86(&x, mode) != 0) - return (-1); - - if (which == DIS_ISIZE_INSTR) - sz = x.d86_len; /* length of the instruction */ - else - sz = x.d86_memsize; /* length of memory operand */ - - if (rmindex != NULL) - *rmindex = x.d86_rmindex; - return (sz); -} - -int -dtrace_instr_size(uchar_t *instr) -{ - return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, DATAMODEL_NATIVE, - NULL)); -} Index: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c diff -N src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/dtrace/x86/dis_tables.c 2 Mar 2017 10:54:26 -0000 @@ -0,0 +1,5597 @@ +/* + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. + */ + +/* + * Copyright (c) 2010, Intel Corporation. + * All rights reserved. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +/* + * $FreeBSD: head/sys/cddl/dev/dtrace/x86/dis_tables.c 313133 2017-02-03 03:22:47Z markj $ + */ + +#include "dis_tables.h" + +/* BEGIN CSTYLED */ + +/* + * Disassembly begins in dis_distable, which is equivalent to the One-byte + * Opcode Map in the Intel IA32 ISA Reference (page A-6 in my copy). The + * decoding loops then traverse out through the other tables as necessary to + * decode a given instruction. + * + * The behavior of this file can be controlled by one of the following flags: + * + * DIS_TEXT Include text for disassembly + * DIS_MEM Include memory-size calculations + * + * Either or both of these can be defined. + * + * This file is not, and will never be, cstyled. If anything, the tables should + * be taken out another tab stop or two so nothing overlaps. + */ + +/* + * These functions must be provided for the consumer to do disassembly. + */ +#ifdef DIS_TEXT +extern char *strncpy(char *, const char *, size_t); +extern size_t strlen(const char *); +extern int strcmp(const char *, const char *); +extern int strncmp(const char *, const char *, size_t); +extern size_t strlcat(char *, const char *, size_t); +#endif + + +#define TERM 0 /* used to indicate that the 'indirect' */ + /* field terminates - no pointer. */ + +/* Used to decode instructions. 
*/ +typedef struct instable { + struct instable *it_indirect; /* for decode op codes */ + uchar_t it_adrmode; +#ifdef DIS_TEXT + char it_name[NCPS]; + uint_t it_suffix:1; /* mnem + "w", "l", or "d" */ +#endif +#ifdef DIS_MEM + uint_t it_size:16; +#endif + uint_t it_invalid64:1; /* opcode invalid in amd64 */ + uint_t it_always64:1; /* 64 bit when in 64 bit mode */ + uint_t it_invalid32:1; /* invalid in IA32 */ + uint_t it_stackop:1; /* push/pop stack operation */ + uint_t it_vexwoxmm:1; /* VEX instructions that don't use XMM/YMM */ + uint_t it_avxsuf:1; /* AVX suffix required */ +} instable_t; + +/* + * Instruction formats. + */ +enum { + UNKNOWN, + MRw, + IMlw, + IMw, + IR, + OA, + AO, + MS, + SM, + Mv, + Mw, + M, /* register or memory */ + MG9, /* register or memory in group 9 (prefix optional) */ + Mb, /* register or memory, always byte sized */ + MO, /* memory only (no registers) */ + PREF, + SWAPGS_RDTSCP, + MONITOR_MWAIT, + R, + RA, + SEG, + MR, + RM, + RM_66r, /* RM, but with a required 0x66 prefix */ + IA, + MA, + SD, + AD, + SA, + D, + INM, + SO, + BD, + I, + P, + V, + DSHIFT, /* for double shift that has an 8-bit immediate */ + U, + OVERRIDE, + NORM, /* instructions w/o ModR/M byte, no memory access */ + IMPLMEM, /* instructions w/o ModR/M byte, implicit mem access */ + O, /* for call */ + JTAB, /* jump table */ + IMUL, /* for 186 iimul instr */ + CBW, /* so data16 can be evaluated for cbw and variants */ + MvI, /* for 186 logicals */ + ENTER, /* for 186 enter instr */ + RMw, /* for 286 arpl instr */ + Ib, /* for push immediate byte */ + F, /* for 287 instructions */ + FF, /* for 287 instructions */ + FFC, /* for 287 instructions */ + DM, /* 16-bit data */ + AM, /* 16-bit addr */ + LSEG, /* for 3-bit seg reg encoding */ + MIb, /* for 386 logicals */ + SREG, /* for 386 special registers */ + PREFIX, /* a REP instruction prefix */ + LOCK, /* a LOCK instruction prefix */ + INT3, /* The int 3 instruction, which has a fake operand */ + INTx, /* The normal int instruction, with explicit int num */ + DSHIFTcl, /* for double shift that implicitly uses %cl */ + CWD, /* so data16 can be evaluated for cwd and variants */ + RET, /* single immediate 16-bit operand */ + MOVZ, /* for movs and movz, with different size operands */ + CRC32, /* for crc32, with different size operands */ + XADDB, /* for xaddb */ + MOVSXZ, /* AMD64 mov sign extend 32 to 64 bit instruction */ + MOVBE, /* movbe instruction */ + +/* + * MMX/SIMD addressing modes. 
+ */ + + MMO, /* Prefixable MMX/SIMD-Int mm/mem -> mm */ + MMOIMPL, /* Prefixable MMX/SIMD-Int mm -> mm (mem) */ + MMO3P, /* Prefixable MMX/SIMD-Int mm -> r32,imm8 */ + MMOM3, /* Prefixable MMX/SIMD-Int mm -> r32 */ + MMOS, /* Prefixable MMX/SIMD-Int mm -> mm/mem */ + MMOMS, /* Prefixable MMX/SIMD-Int mm -> mem */ + MMOPM, /* MMX/SIMD-Int mm/mem -> mm,imm8 */ + MMOPM_66o, /* MMX/SIMD-Int 0x66 optional mm/mem -> mm,imm8 */ + MMOPRM, /* Prefixable MMX/SIMD-Int r32/mem -> mm,imm8 */ + MMOSH, /* Prefixable MMX mm,imm8 */ + MM, /* MMX/SIMD-Int mm/mem -> mm */ + MMS, /* MMX/SIMD-Int mm -> mm/mem */ + MMSH, /* MMX mm,imm8 */ + XMMO, /* Prefixable SIMD xmm/mem -> xmm */ + XMMOS, /* Prefixable SIMD xmm -> xmm/mem */ + XMMOPM, /* Prefixable SIMD xmm/mem w/to xmm,imm8 */ + XMMOMX, /* Prefixable SIMD mm/mem -> xmm */ + XMMOX3, /* Prefixable SIMD xmm -> r32 */ + XMMOXMM, /* Prefixable SIMD xmm/mem -> mm */ + XMMOM, /* Prefixable SIMD xmm -> mem */ + XMMOMS, /* Prefixable SIMD mem -> xmm */ + XMM, /* SIMD xmm/mem -> xmm */ + XMM_66r, /* SIMD 0x66 prefix required xmm/mem -> xmm */ + XMM_66o, /* SIMD 0x66 prefix optional xmm/mem -> xmm */ + XMMXIMPL, /* SIMD xmm -> xmm (mem) */ + XMM3P, /* SIMD xmm -> r32,imm8 */ + XMM3PM_66r, /* SIMD 0x66 prefix required xmm -> r32/mem,imm8 */ + XMMP, /* SIMD xmm/mem w/to xmm,imm8 */ + XMMP_66o, /* SIMD 0x66 prefix optional xmm/mem w/to xmm,imm8 */ + XMMP_66r, /* SIMD 0x66 prefix required xmm/mem w/to xmm,imm8 */ + XMMPRM, /* SIMD r32/mem -> xmm,imm8 */ + XMMPRM_66r, /* SIMD 0x66 prefix required r32/mem -> xmm,imm8 */ + XMMS, /* SIMD xmm -> xmm/mem */ + XMMM, /* SIMD mem -> xmm */ + XMMM_66r, /* SIMD 0x66 prefix required mem -> xmm */ + XMMMS, /* SIMD xmm -> mem */ + XMM3MX, /* SIMD r32/mem -> xmm */ + XMM3MXS, /* SIMD xmm -> r32/mem */ + XMMSH, /* SIMD xmm,imm8 */ + XMMXM3, /* SIMD xmm/mem -> r32 */ + XMMX3, /* SIMD xmm -> r32 */ + XMMXMM, /* SIMD xmm/mem -> mm */ + XMMMX, /* SIMD mm -> xmm */ + XMMXM, /* SIMD xmm -> mm */ + XMMX2I, /* SIMD xmm -> xmm, imm, imm */ + XMM2I, /* SIMD xmm, imm, imm */ + XMMFENCE, /* SIMD lfence or mfence */ + XMMSFNC, /* SIMD sfence (none or mem) */ + XGETBV_XSETBV, + VEX_NONE, /* VEX no operand */ + VEX_MO, /* VEX mod_rm -> implicit reg */ + VEX_RMrX, /* VEX VEX.vvvv, mod_rm -> mod_reg */ + VEX_VRMrX, /* VEX mod_rm, VEX.vvvv -> mod_rm */ + VEX_RRX, /* VEX VEX.vvvv, mod_reg -> mod_rm */ + VEX_RMRX, /* VEX VEX.vvvv, mod_rm, imm8[7:4] -> mod_reg */ + VEX_MX, /* VEX mod_rm -> mod_reg */ + VEX_MXI, /* VEX mod_rm, imm8 -> mod_reg */ + VEX_XXI, /* VEX mod_rm, imm8 -> VEX.vvvv */ + VEX_MR, /* VEX mod_rm -> mod_reg */ + VEX_RRI, /* VEX mod_reg, mod_rm -> implicit(eflags/r32) */ + VEX_RX, /* VEX mod_reg -> mod_rm */ + VEX_RR, /* VEX mod_rm -> mod_reg */ + VEX_RRi, /* VEX mod_rm, imm8 -> mod_reg */ + VEX_RM, /* VEX mod_reg -> mod_rm */ + VEX_RIM, /* VEX mod_reg, imm8 -> mod_rm */ + VEX_RRM, /* VEX VEX.vvvv, mod_reg -> mod_rm */ + VEX_RMX, /* VEX VEX.vvvv, mod_rm -> mod_reg */ + VEX_SbVM, /* VEX SIB, VEX.vvvv -> mod_rm */ + VMx, /* vmcall/vmlaunch/vmresume/vmxoff */ + VMxo, /* VMx instruction with optional prefix */ + SVM, /* AMD SVM instructions */ + BLS, /* BLSR, BLSMSK, BLSI */ + FMA, /* FMA instructions, all VEX_RMrX */ + ADX /* ADX instructions, support REX.w, mod_rm->mod_reg */ +}; + +/* + * VEX prefixes + */ +#define VEX_2bytes 0xC5 /* the first byte of two-byte form */ +#define VEX_3bytes 0xC4 /* the first byte of three-byte form */ + +#define FILL 0x90 /* Fill byte used for alignment (nop) */ + +/* +** Register numbers for the i386 +*/ 
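
[Editor's note, not part of the patch] The *_REGNO constants above number the i386 general registers in ModR/M encoding order, and the dis_REG*[] name tables further down in the imported file are indexed the same way. A small standalone sketch of how the mode, reg and r_m fields of a ModR/M byte are split out and used to index such a table; the names here are local to the example, not the file's instable_t machinery:

#include <stdio.h>

static const char *const reg32[8] = {
	"%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi"
};

int
main(void)
{
	unsigned char modrm = 0xd8;		/* bit pattern 11 011 000 */
	unsigned mode = (modrm >> 6) & 0x3;	/* 3: register-direct */
	unsigned reg  = (modrm >> 3) & 0x7;	/* 3: EBX_REGNO */
	unsigned r_m  = modrm & 0x7;		/* 0: EAX_REGNO */

	printf("mode=%u reg=%s r_m=%s\n", mode, reg32[reg], reg32[r_m]);
	return 0;
}
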
+#define EAX_REGNO 0 +#define ECX_REGNO 1 +#define EDX_REGNO 2 +#define EBX_REGNO 3 +#define ESP_REGNO 4 +#define EBP_REGNO 5 +#define ESI_REGNO 6 +#define EDI_REGNO 7 + +/* + * modes for immediate values + */ +#define MODE_NONE 0 +#define MODE_IPREL 1 /* signed IP relative value */ +#define MODE_SIGNED 2 /* sign extended immediate */ +#define MODE_IMPLIED 3 /* constant value implied from opcode */ +#define MODE_OFFSET 4 /* offset part of an address */ +#define MODE_RIPREL 5 /* like IPREL, but from %rip (amd64) */ + +/* + * The letters used in these macros are: + * IND - indirect to another to another table + * "T" - means to Terminate indirections (this is the final opcode) + * "S" - means "operand length suffix required" + * "Sa" - means AVX2 suffix (d/q) required + * "NS" - means "no suffix" which is the operand length suffix of the opcode + * "Z" - means instruction size arg required + * "u" - means the opcode is invalid in IA32 but valid in amd64 + * "x" - means the opcode is invalid in amd64, but not IA32 + * "y" - means the operand size is always 64 bits in 64 bit mode + * "p" - means push/pop stack operation + * "vr" - means VEX instruction that operates on normal registers, not fpu + */ + +#if defined(DIS_TEXT) && defined(DIS_MEM) +#define IND(table) {(instable_t *)table, 0, "", 0, 0, 0, 0, 0, 0} +#define INDx(table) {(instable_t *)table, 0, "", 0, 0, 1, 0, 0, 0} +#define TNS(name, amode) {TERM, amode, name, 0, 0, 0, 0, 0, 0} +#define TNSu(name, amode) {TERM, amode, name, 0, 0, 0, 0, 1, 0} +#define TNSx(name, amode) {TERM, amode, name, 0, 0, 1, 0, 0, 0} +#define TNSy(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0, 0} +#define TNSyp(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0, 1} +#define TNSZ(name, amode, sz) {TERM, amode, name, 0, sz, 0, 0, 0, 0} +#define TNSZy(name, amode, sz) {TERM, amode, name, 0, sz, 0, 1, 0, 0} +#define TNSZvr(name, amode, sz) {TERM, amode, name, 0, sz, 0, 0, 0, 0, 1} +#define TS(name, amode) {TERM, amode, name, 1, 0, 0, 0, 0, 0} +#define TSx(name, amode) {TERM, amode, name, 1, 0, 1, 0, 0, 0} +#define TSy(name, amode) {TERM, amode, name, 1, 0, 0, 1, 0, 0} +#define TSp(name, amode) {TERM, amode, name, 1, 0, 0, 0, 0, 1} +#define TSZ(name, amode, sz) {TERM, amode, name, 1, sz, 0, 0, 0, 0} +#define TSaZ(name, amode, sz) {TERM, amode, name, 1, sz, 0, 0, 0, 0, 0, 1} +#define TSZx(name, amode, sz) {TERM, amode, name, 1, sz, 1, 0, 0, 0} +#define TSZy(name, amode, sz) {TERM, amode, name, 1, sz, 0, 1, 0, 0} +#define INVALID {TERM, UNKNOWN, "", 0, 0, 0, 0, 0} +#elif defined(DIS_TEXT) +#define IND(table) {(instable_t *)table, 0, "", 0, 0, 0, 0, 0} +#define INDx(table) {(instable_t *)table, 0, "", 0, 1, 0, 0, 0} +#define TNS(name, amode) {TERM, amode, name, 0, 0, 0, 0, 0} +#define TNSu(name, amode) {TERM, amode, name, 0, 0, 0, 1, 0} +#define TNSx(name, amode) {TERM, amode, name, 0, 1, 0, 0, 0} +#define TNSy(name, amode) {TERM, amode, name, 0, 0, 1, 0, 0} +#define TNSyp(name, amode) {TERM, amode, name, 0, 0, 1, 0, 1} +#define TNSZ(name, amode, sz) {TERM, amode, name, 0, 0, 0, 0, 0} +#define TNSZy(name, amode, sz) {TERM, amode, name, 0, 0, 1, 0, 0} +#define TNSZvr(name, amode, sz) {TERM, amode, name, 0, 0, 0, 0, 0, 1} +#define TS(name, amode) {TERM, amode, name, 1, 0, 0, 0, 0} +#define TSx(name, amode) {TERM, amode, name, 1, 1, 0, 0, 0} +#define TSy(name, amode) {TERM, amode, name, 1, 0, 1, 0, 0} +#define TSp(name, amode) {TERM, amode, name, 1, 0, 0, 0, 1} +#define TSZ(name, amode, sz) {TERM, amode, name, 1, 0, 0, 0, 0} +#define TSaZ(name, amode, sz) {TERM, amode, name, 
1, 0, 0, 0, 0, 0, 1} +#define TSZx(name, amode, sz) {TERM, amode, name, 1, 1, 0, 0, 0} +#define TSZy(name, amode, sz) {TERM, amode, name, 1, 0, 1, 0, 0} +#define INVALID {TERM, UNKNOWN, "", 0, 0, 0, 0, 0} +#elif defined(DIS_MEM) +#define IND(table) {(instable_t *)table, 0, 0, 0, 0, 0, 0} +#define INDx(table) {(instable_t *)table, 0, 0, 1, 0, 0, 0} +#define TNS(name, amode) {TERM, amode, 0, 0, 0, 0, 0} +#define TNSu(name, amode) {TERM, amode, 0, 0, 0, 1, 0} +#define TNSy(name, amode) {TERM, amode, 0, 0, 1, 0, 0} +#define TNSyp(name, amode) {TERM, amode, 0, 0, 1, 0, 1} +#define TNSx(name, amode) {TERM, amode, 0, 1, 0, 0, 0} +#define TNSZ(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0} +#define TNSZy(name, amode, sz) {TERM, amode, sz, 0, 1, 0, 0} +#define TNSZvr(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0, 1} +#define TS(name, amode) {TERM, amode, 0, 0, 0, 0, 0} +#define TSx(name, amode) {TERM, amode, 0, 1, 0, 0, 0} +#define TSy(name, amode) {TERM, amode, 0, 0, 1, 0, 0} +#define TSp(name, amode) {TERM, amode, 0, 0, 0, 0, 1} +#define TSZ(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0} +#define TSaZ(name, amode, sz) {TERM, amode, sz, 0, 0, 0, 0, 0, 1} +#define TSZx(name, amode, sz) {TERM, amode, sz, 1, 0, 0, 0} +#define TSZy(name, amode, sz) {TERM, amode, sz, 0, 1, 0, 0} +#define INVALID {TERM, UNKNOWN, 0, 0, 0, 0, 0} +#else +#define IND(table) {(instable_t *)table, 0, 0, 0, 0, 0} +#define INDx(table) {(instable_t *)table, 0, 1, 0, 0, 0} +#define TNS(name, amode) {TERM, amode, 0, 0, 0, 0} +#define TNSu(name, amode) {TERM, amode, 0, 0, 1, 0} +#define TNSy(name, amode) {TERM, amode, 0, 1, 0, 0} +#define TNSyp(name, amode) {TERM, amode, 0, 1, 0, 1} +#define TNSx(name, amode) {TERM, amode, 1, 0, 0, 0} +#define TNSZ(name, amode, sz) {TERM, amode, 0, 0, 0, 0} +#define TNSZy(name, amode, sz) {TERM, amode, 0, 1, 0, 0} +#define TNSZvr(name, amode, sz) {TERM, amode, 0, 0, 0, 0, 1} +#define TS(name, amode) {TERM, amode, 0, 0, 0, 0} +#define TSx(name, amode) {TERM, amode, 1, 0, 0, 0} +#define TSy(name, amode) {TERM, amode, 0, 1, 0, 0} +#define TSp(name, amode) {TERM, amode, 0, 0, 0, 1} +#define TSZ(name, amode, sz) {TERM, amode, 0, 0, 0, 0} +#define TSaZ(name, amode, sz) {TERM, amode, 0, 0, 0, 0, 0, 1} +#define TSZx(name, amode, sz) {TERM, amode, 1, 0, 0, 0} +#define TSZy(name, amode, sz) {TERM, amode, 0, 1, 0, 0} +#define INVALID {TERM, UNKNOWN, 0, 0, 0, 0} +#endif + +#ifdef DIS_TEXT +/* + * this decodes the r_m field for mode's 0, 1, 2 in 16 bit mode + */ +const char *const dis_addr16[3][8] = { +"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "", + "(%bx)", +"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di", "(%bp)", + "(%bx)", +"(%bx,%si)", "(%bx,%di)", "(%bp,%si)", "(%bp,%di)", "(%si)", "(%di)", "(%bp)", + "(%bx)", +}; + + +/* + * This decodes 32 bit addressing mode r_m field for modes 0, 1, 2 + */ +const char *const dis_addr32_mode0[16] = { + "(%eax)", "(%ecx)", "(%edx)", "(%ebx)", "", "", "(%esi)", "(%edi)", + "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "", "(%r14d)", "(%r15d)" +}; + +const char *const dis_addr32_mode12[16] = { + "(%eax)", "(%ecx)", "(%edx)", "(%ebx)", "", "(%ebp)", "(%esi)", "(%edi)", + "(%r8d)", "(%r9d)", "(%r10d)", "(%r11d)", "", "(%r13d)", "(%r14d)", "(%r15d)" +}; + +/* + * This decodes 64 bit addressing mode r_m field for modes 0, 1, 2 + */ +const char *const dis_addr64_mode0[16] = { + "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "", "(%rip)", "(%rsi)", "(%rdi)", + "(%r8)", "(%r9)", "(%r10)", "(%r11)", "(%r12)", "(%rip)", "(%r14)", "(%r15)" 
+}; +const char *const dis_addr64_mode12[16] = { + "(%rax)", "(%rcx)", "(%rdx)", "(%rbx)", "", "(%rbp)", "(%rsi)", "(%rdi)", + "(%r8)", "(%r9)", "(%r10)", "(%r11)", "(%r12)", "(%r13)", "(%r14)", "(%r15)" +}; + +/* + * decode for scale from SIB byte + */ +const char *const dis_scale_factor[4] = { ")", ",2)", ",4)", ",8)" }; + +/* + * decode for scale from VSIB byte, note that we always include the scale factor + * to match gas. + */ +const char *const dis_vscale_factor[4] = { ",1)", ",2)", ",4)", ",8)" }; + +/* + * register decoding for normal references to registers (ie. not addressing) + */ +const char *const dis_REG8[16] = { + "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh", + "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" +}; + +const char *const dis_REG8_REX[16] = { + "%al", "%cl", "%dl", "%bl", "%spl", "%bpl", "%sil", "%dil", + "%r8b", "%r9b", "%r10b", "%r11b", "%r12b", "%r13b", "%r14b", "%r15b" +}; + +const char *const dis_REG16[16] = { + "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di", + "%r8w", "%r9w", "%r10w", "%r11w", "%r12w", "%r13w", "%r14w", "%r15w" +}; + +const char *const dis_REG32[16] = { + "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", + "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" +}; + +const char *const dis_REG64[16] = { + "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", + "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" +}; + +const char *const dis_DEBUGREG[16] = { + "%db0", "%db1", "%db2", "%db3", "%db4", "%db5", "%db6", "%db7", + "%db8", "%db9", "%db10", "%db11", "%db12", "%db13", "%db14", "%db15" +}; + +const char *const dis_CONTROLREG[16] = { + "%cr0", "%cr1", "%cr2", "%cr3", "%cr4", "%cr5?", "%cr6?", "%cr7?", + "%cr8", "%cr9?", "%cr10?", "%cr11?", "%cr12?", "%cr13?", "%cr14?", "%cr15?" 
+}; + +const char *const dis_TESTREG[16] = { + "%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7", + "%tr0?", "%tr1?", "%tr2?", "%tr3", "%tr4", "%tr5", "%tr6", "%tr7" +}; + +const char *const dis_MMREG[16] = { + "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7", + "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" +}; + +const char *const dis_XMMREG[16] = { + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15" +}; + +const char *const dis_YMMREG[16] = { + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", + "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15" +}; + +const char *const dis_SEGREG[16] = { + "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "", + "%es", "%cs", "%ss", "%ds", "%fs", "%gs", "", "" +}; + +/* + * SIMD predicate suffixes + */ +const char *const dis_PREDSUFFIX[8] = { + "eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord" +}; + +const char *const dis_AVXvgrp7[3][8] = { + /*0 1 2 3 4 5 6 7*/ +/*71*/ {"", "", "vpsrlw", "", "vpsraw", "", "vpsllw", ""}, +/*72*/ {"", "", "vpsrld", "", "vpsrad", "", "vpslld", ""}, +/*73*/ {"", "", "vpsrlq", "vpsrldq", "", "", "vpsllq", "vpslldq"} +}; + +#endif /* DIS_TEXT */ + +/* + * "decode table" for 64 bit mode MOVSXD instruction (opcode 0x63) + */ +const instable_t dis_opMOVSLD = TNS("movslq",MOVSXZ); + +/* + * "decode table" for pause and clflush instructions + */ +const instable_t dis_opPause = TNS("pause", NORM); + +/* + * Decode table for 0x0F00 opcodes + */ +const instable_t dis_op0F00[8] = { + +/* [0] */ TNS("sldt",M), TNS("str",M), TNSy("lldt",M), TNSy("ltr",M), +/* [4] */ TNSZ("verr",M,2), TNSZ("verw",M,2), INVALID, INVALID, +}; + + +/* + * Decode table for 0x0F01 opcodes + */ +const instable_t dis_op0F01[8] = { + +/* [0] */ TNSZ("sgdt",VMx,6), TNSZ("sidt",MONITOR_MWAIT,6), TNSZ("lgdt",XGETBV_XSETBV,6), TNSZ("lidt",SVM,6), +/* [4] */ TNSZ("smsw",M,2), INVALID, TNSZ("lmsw",M,2), TNS("invlpg",SWAPGS_RDTSCP), +}; + +/* + * Decode table for 0x0F18 opcodes -- SIMD prefetch + */ +const instable_t dis_op0F18[8] = { + +/* [0] */ TNS("prefetchnta",PREF),TNS("prefetcht0",PREF), TNS("prefetcht1",PREF), TNS("prefetcht2",PREF), +/* [4] */ INVALID, INVALID, INVALID, INVALID, +}; + +/* + * Decode table for 0x0FAE opcodes -- SIMD state save/restore + */ +const instable_t dis_op0FAE[8] = { +/* [0] */ TNSZ("fxsave",M,512), TNSZ("fxrstor",M,512), TNS("ldmxcsr",M), TNS("stmxcsr",M), +/* [4] */ TNSZ("xsave",M,512), TNS("lfence",XMMFENCE), TNS("mfence",XMMFENCE), TNS("sfence",XMMSFNC), +}; + +/* + * Decode table for 0x0FBA opcodes + */ + +const instable_t dis_op0FBA[8] = { + +/* [0] */ INVALID, INVALID, INVALID, INVALID, +/* [4] */ TS("bt",MIb), TS("bts",MIb), TS("btr",MIb), TS("btc",MIb), +}; + +/* + * Decode table for 0x0FC7 opcode (group 9) + */ + +const instable_t dis_op0FC7[8] = { + +/* [0] */ INVALID, TNS("cmpxchg8b",M), INVALID, INVALID, +/* [4] */ INVALID, INVALID, TNS("vmptrld",MG9), TNS("vmptrst",MG9), +}; + +/* + * Decode table for 0x0FC7 opcode (group 9) mode 3 + */ + +const instable_t dis_op0FC7m3[8] = { + +/* [0] */ INVALID, INVALID, INVALID, INVALID, +/* [4] */ INVALID, INVALID, TNS("rdrand",MG9), TNS("rdseed", MG9), +}; + +/* + * Decode table for 0x0FC7 opcode with 0x66 prefix + */ + +const instable_t dis_op660FC7[8] = { + +/* [0] */ INVALID, INVALID, INVALID, INVALID, +/* [4] */ INVALID, INVALID, TNS("vmclear",M), INVALID, +}; + +/* + * Decode table for 
0x0FC7 opcode with 0xF3 prefix + */ + +const instable_t dis_opF30FC7[8] = { + +/* [0] */ INVALID, INVALID, INVALID, INVALID, +/* [4] */ INVALID, INVALID, TNS("vmxon",M), INVALID, +}; + +/* + * Decode table for 0x0FC8 opcode -- 486 bswap instruction + * + *bit pattern: 0000 1111 1100 1reg + */ +const instable_t dis_op0FC8[4] = { +/* [0] */ TNS("bswap",R), INVALID, INVALID, INVALID, +}; + +/* + * Decode table for 0x0F71, 0x0F72, and 0x0F73 opcodes -- MMX instructions + */ +const instable_t dis_op0F7123[4][8] = { +{ +/* [70].0 */ INVALID, INVALID, INVALID, INVALID, +/* .4 */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [71].0 */ INVALID, INVALID, TNS("psrlw",MMOSH), INVALID, +/* .4 */ TNS("psraw",MMOSH), INVALID, TNS("psllw",MMOSH), INVALID, +}, { +/* [72].0 */ INVALID, INVALID, TNS("psrld",MMOSH), INVALID, +/* .4 */ TNS("psrad",MMOSH), INVALID, TNS("pslld",MMOSH), INVALID, +}, { +/* [73].0 */ INVALID, INVALID, TNS("psrlq",MMOSH), TNS("INVALID",MMOSH), +/* .4 */ INVALID, INVALID, TNS("psllq",MMOSH), TNS("INVALID",MMOSH), +} }; + +/* + * Decode table for SIMD extensions to above 0x0F71-0x0F73 opcodes. + */ +const instable_t dis_opSIMD7123[32] = { +/* [70].0 */ INVALID, INVALID, INVALID, INVALID, +/* .4 */ INVALID, INVALID, INVALID, INVALID, + +/* [71].0 */ INVALID, INVALID, TNS("psrlw",XMMSH), INVALID, +/* .4 */ TNS("psraw",XMMSH), INVALID, TNS("psllw",XMMSH), INVALID, + +/* [72].0 */ INVALID, INVALID, TNS("psrld",XMMSH), INVALID, +/* .4 */ TNS("psrad",XMMSH), INVALID, TNS("pslld",XMMSH), INVALID, + +/* [73].0 */ INVALID, INVALID, TNS("psrlq",XMMSH), TNS("psrldq",XMMSH), +/* .4 */ INVALID, INVALID, TNS("psllq",XMMSH), TNS("pslldq",XMMSH), +}; + +/* + * SIMD instructions have been wedged into the existing IA32 instruction + * set through the use of prefixes. That is, while 0xf0 0x58 may be + * addps, 0xf3 0xf0 0x58 (literally, repz addps) is a completely different + * instruction - addss. At present, three prefixes have been coopted in + * this manner - address size (0x66), repnz (0xf2) and repz (0xf3). The + * following tables are used to provide the prefixed instruction names. + * The arrays are sparse, but they're fast. + */ + +/* + * Decode table for SIMD instructions with the address size (0x66) prefix. 
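
[Editor's note, not part of the patch] The block comment above describes how the 0x66, 0xF2 and 0xF3 prefixes turn a single 0x0F opcode into entirely different SIMD instructions, with a sparse per-prefix name table supplying each variant. A hedged sketch of that dispatch; the tables and names below are placeholders rather than the real instable_t entries, though the mnemonics for opcode 0F 58 (addps / addpd / addsd / addss) are standard x86:

#include <stdio.h>

/* Placeholder name tables indexed by the byte that follows 0x0F. */
static const char *const base_0f[256]   = { [0x58] = "addps" };
static const char *const data16_0f[256] = { [0x58] = "addpd" };	/* 0x66 */
static const char *const repnz_0f[256]  = { [0x58] = "addsd" };	/* 0xF2 */
static const char *const repz_0f[256]   = { [0x58] = "addss" };	/* 0xF3 */

static const char *
lookup_0f(unsigned char prefix, unsigned char opcode)
{
	const char *const *tbl;

	switch (prefix) {
	case 0x66: tbl = data16_0f; break;
	case 0xF2: tbl = repnz_0f;  break;
	case 0xF3: tbl = repz_0f;   break;
	default:   tbl = base_0f;   break;
	}
	return tbl[opcode] != NULL ? tbl[opcode] : "(invalid)";
}

int
main(void)
{
	/* "repz addps" is really addss, as the comment above explains. */
	printf("0f 58    -> %s\n", lookup_0f(0, 0x58));
	printf("f3 0f 58 -> %s\n", lookup_0f(0xF3, 0x58));
	return 0;
}
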
+ */ +const instable_t dis_opSIMDdata16[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ TNSZ("movupd",XMM,16), TNSZ("movupd",XMMS,16), TNSZ("movlpd",XMMM,8), TNSZ("movlpd",XMMMS,8), +/* [14] */ TNSZ("unpcklpd",XMM,16),TNSZ("unpckhpd",XMM,16),TNSZ("movhpd",XMMM,8), TNSZ("movhpd",XMMMS,8), +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ TNSZ("movapd",XMM,16), TNSZ("movapd",XMMS,16), TNSZ("cvtpi2pd",XMMOMX,8),TNSZ("movntpd",XMMOMS,16), +/* [2C] */ TNSZ("cvttpd2pi",XMMXMM,16),TNSZ("cvtpd2pi",XMMXMM,16),TNSZ("ucomisd",XMM,8),TNSZ("comisd",XMM,8), + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ TNS("movmskpd",XMMOX3), TNSZ("sqrtpd",XMM,16), INVALID, INVALID, +/* [54] */ TNSZ("andpd",XMM,16), TNSZ("andnpd",XMM,16), TNSZ("orpd",XMM,16), TNSZ("xorpd",XMM,16), +/* [58] */ TNSZ("addpd",XMM,16), TNSZ("mulpd",XMM,16), TNSZ("cvtpd2ps",XMM,16),TNSZ("cvtps2dq",XMM,16), +/* [5C] */ TNSZ("subpd",XMM,16), TNSZ("minpd",XMM,16), TNSZ("divpd",XMM,16), TNSZ("maxpd",XMM,16), + +/* [60] */ TNSZ("punpcklbw",XMM,16),TNSZ("punpcklwd",XMM,16),TNSZ("punpckldq",XMM,16),TNSZ("packsswb",XMM,16), +/* [64] */ TNSZ("pcmpgtb",XMM,16), TNSZ("pcmpgtw",XMM,16), TNSZ("pcmpgtd",XMM,16), TNSZ("packuswb",XMM,16), +/* [68] */ TNSZ("punpckhbw",XMM,16),TNSZ("punpckhwd",XMM,16),TNSZ("punpckhdq",XMM,16),TNSZ("packssdw",XMM,16), +/* [6C] */ TNSZ("punpcklqdq",XMM,16),TNSZ("punpckhqdq",XMM,16),TNSZ("movd",XMM3MX,4),TNSZ("movdqa",XMM,16), + +/* [70] */ TNSZ("pshufd",XMMP,16), INVALID, INVALID, INVALID, +/* [74] */ TNSZ("pcmpeqb",XMM,16), TNSZ("pcmpeqw",XMM,16), TNSZ("pcmpeqd",XMM,16), INVALID, +/* [78] */ TNSZ("extrq",XMM2I,16), TNSZ("extrq",XMM,16), INVALID, INVALID, +/* [7C] */ TNSZ("haddpd",XMM,16), TNSZ("hsubpd",XMM,16), TNSZ("movd",XMM3MXS,4), TNSZ("movdqa",XMMS,16), + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [8C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, TNSZ("cmppd",XMMP,16), INVALID, +/* [C4] */ TNSZ("pinsrw",XMMPRM,2),TNS("pextrw",XMM3P), TNSZ("shufpd",XMMP,16), INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ TNSZ("addsubpd",XMM,16),TNSZ("psrlw",XMM,16), TNSZ("psrld",XMM,16), 
TNSZ("psrlq",XMM,16), +/* [D4] */ TNSZ("paddq",XMM,16), TNSZ("pmullw",XMM,16), TNSZ("movq",XMMS,8), TNS("pmovmskb",XMMX3), +/* [D8] */ TNSZ("psubusb",XMM,16), TNSZ("psubusw",XMM,16), TNSZ("pminub",XMM,16), TNSZ("pand",XMM,16), +/* [DC] */ TNSZ("paddusb",XMM,16), TNSZ("paddusw",XMM,16), TNSZ("pmaxub",XMM,16), TNSZ("pandn",XMM,16), + +/* [E0] */ TNSZ("pavgb",XMM,16), TNSZ("psraw",XMM,16), TNSZ("psrad",XMM,16), TNSZ("pavgw",XMM,16), +/* [E4] */ TNSZ("pmulhuw",XMM,16), TNSZ("pmulhw",XMM,16), TNSZ("cvttpd2dq",XMM,16),TNSZ("movntdq",XMMS,16), +/* [E8] */ TNSZ("psubsb",XMM,16), TNSZ("psubsw",XMM,16), TNSZ("pminsw",XMM,16), TNSZ("por",XMM,16), +/* [EC] */ TNSZ("paddsb",XMM,16), TNSZ("paddsw",XMM,16), TNSZ("pmaxsw",XMM,16), TNSZ("pxor",XMM,16), + +/* [F0] */ INVALID, TNSZ("psllw",XMM,16), TNSZ("pslld",XMM,16), TNSZ("psllq",XMM,16), +/* [F4] */ TNSZ("pmuludq",XMM,16), TNSZ("pmaddwd",XMM,16), TNSZ("psadbw",XMM,16), TNSZ("maskmovdqu", XMMXIMPL,16), +/* [F8] */ TNSZ("psubb",XMM,16), TNSZ("psubw",XMM,16), TNSZ("psubd",XMM,16), TNSZ("psubq",XMM,16), +/* [FC] */ TNSZ("paddb",XMM,16), TNSZ("paddw",XMM,16), TNSZ("paddd",XMM,16), INVALID, +}; + +const instable_t dis_opAVX660F[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ TNSZ("vmovupd",VEX_MX,16), TNSZ("vmovupd",VEX_RX,16), TNSZ("vmovlpd",VEX_RMrX,8), TNSZ("vmovlpd",VEX_RM,8), +/* [14] */ TNSZ("vunpcklpd",VEX_RMrX,16),TNSZ("vunpckhpd",VEX_RMrX,16),TNSZ("vmovhpd",VEX_RMrX,8), TNSZ("vmovhpd",VEX_RM,8), +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ TNSZ("vmovapd",VEX_MX,16), TNSZ("vmovapd",VEX_RX,16), INVALID, TNSZ("vmovntpd",VEX_RM,16), +/* [2C] */ INVALID, INVALID, TNSZ("vucomisd",VEX_MX,8),TNSZ("vcomisd",VEX_MX,8), + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ TNS("vmovmskpd",VEX_MR), TNSZ("vsqrtpd",VEX_MX,16), INVALID, INVALID, +/* [54] */ TNSZ("vandpd",VEX_RMrX,16), TNSZ("vandnpd",VEX_RMrX,16), TNSZ("vorpd",VEX_RMrX,16), TNSZ("vxorpd",VEX_RMrX,16), +/* [58] */ TNSZ("vaddpd",VEX_RMrX,16), TNSZ("vmulpd",VEX_RMrX,16), TNSZ("vcvtpd2ps",VEX_MX,16),TNSZ("vcvtps2dq",VEX_MX,16), +/* [5C] */ TNSZ("vsubpd",VEX_RMrX,16), TNSZ("vminpd",VEX_RMrX,16), TNSZ("vdivpd",VEX_RMrX,16), TNSZ("vmaxpd",VEX_RMrX,16), + +/* [60] */ TNSZ("vpunpcklbw",VEX_RMrX,16),TNSZ("vpunpcklwd",VEX_RMrX,16),TNSZ("vpunpckldq",VEX_RMrX,16),TNSZ("vpacksswb",VEX_RMrX,16), +/* [64] */ TNSZ("vpcmpgtb",VEX_RMrX,16), TNSZ("vpcmpgtw",VEX_RMrX,16), TNSZ("vpcmpgtd",VEX_RMrX,16), TNSZ("vpackuswb",VEX_RMrX,16), +/* [68] */ TNSZ("vpunpckhbw",VEX_RMrX,16),TNSZ("vpunpckhwd",VEX_RMrX,16),TNSZ("vpunpckhdq",VEX_RMrX,16),TNSZ("vpackssdw",VEX_RMrX,16), +/* [6C] */ TNSZ("vpunpcklqdq",VEX_RMrX,16),TNSZ("vpunpckhqdq",VEX_RMrX,16),TNSZ("vmovd",VEX_MX,4),TNSZ("vmovdqa",VEX_MX,16), + +/* [70] */ TNSZ("vpshufd",VEX_MXI,16), TNSZ("vgrp71",VEX_XXI,16), TNSZ("vgrp72",VEX_XXI,16), TNSZ("vgrp73",VEX_XXI,16), +/* [74] */ TNSZ("vpcmpeqb",VEX_RMrX,16), 
TNSZ("vpcmpeqw",VEX_RMrX,16), TNSZ("vpcmpeqd",VEX_RMrX,16), INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ TNSZ("vhaddpd",VEX_RMrX,16), TNSZ("vhsubpd",VEX_RMrX,16), TNSZ("vmovd",VEX_RR,4), TNSZ("vmovdqa",VEX_RX,16), + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [8C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, TNSZ("vcmppd",VEX_RMRX,16), INVALID, +/* [C4] */ TNSZ("vpinsrw",VEX_RMRX,2),TNS("vpextrw",VEX_MR), TNSZ("vshufpd",VEX_RMRX,16), INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ TNSZ("vaddsubpd",VEX_RMrX,16),TNSZ("vpsrlw",VEX_RMrX,16), TNSZ("vpsrld",VEX_RMrX,16), TNSZ("vpsrlq",VEX_RMrX,16), +/* [D4] */ TNSZ("vpaddq",VEX_RMrX,16), TNSZ("vpmullw",VEX_RMrX,16), TNSZ("vmovq",VEX_RX,8), TNS("vpmovmskb",VEX_MR), +/* [D8] */ TNSZ("vpsubusb",VEX_RMrX,16), TNSZ("vpsubusw",VEX_RMrX,16), TNSZ("vpminub",VEX_RMrX,16), TNSZ("vpand",VEX_RMrX,16), +/* [DC] */ TNSZ("vpaddusb",VEX_RMrX,16), TNSZ("vpaddusw",VEX_RMrX,16), TNSZ("vpmaxub",VEX_RMrX,16), TNSZ("vpandn",VEX_RMrX,16), + +/* [E0] */ TNSZ("vpavgb",VEX_RMrX,16), TNSZ("vpsraw",VEX_RMrX,16), TNSZ("vpsrad",VEX_RMrX,16), TNSZ("vpavgw",VEX_RMrX,16), +/* [E4] */ TNSZ("vpmulhuw",VEX_RMrX,16), TNSZ("vpmulhw",VEX_RMrX,16), TNSZ("vcvttpd2dq",VEX_MX,16),TNSZ("vmovntdq",VEX_RM,16), +/* [E8] */ TNSZ("vpsubsb",VEX_RMrX,16), TNSZ("vpsubsw",VEX_RMrX,16), TNSZ("vpminsw",VEX_RMrX,16), TNSZ("vpor",VEX_RMrX,16), +/* [EC] */ TNSZ("vpaddsb",VEX_RMrX,16), TNSZ("vpaddsw",VEX_RMrX,16), TNSZ("vpmaxsw",VEX_RMrX,16), TNSZ("vpxor",VEX_RMrX,16), + +/* [F0] */ INVALID, TNSZ("vpsllw",VEX_RMrX,16), TNSZ("vpslld",VEX_RMrX,16), TNSZ("vpsllq",VEX_RMrX,16), +/* [F4] */ TNSZ("vpmuludq",VEX_RMrX,16), TNSZ("vpmaddwd",VEX_RMrX,16), TNSZ("vpsadbw",VEX_RMrX,16), TNS("vmaskmovdqu",VEX_MX), +/* [F8] */ TNSZ("vpsubb",VEX_RMrX,16), TNSZ("vpsubw",VEX_RMrX,16), TNSZ("vpsubd",VEX_RMrX,16), TNSZ("vpsubq",VEX_RMrX,16), +/* [FC] */ TNSZ("vpaddb",VEX_RMrX,16), TNSZ("vpaddw",VEX_RMrX,16), TNSZ("vpaddd",VEX_RMrX,16), INVALID, +}; + +/* + * Decode table for SIMD instructions with the repnz (0xf2) prefix. 
+ */ +const instable_t dis_opSIMDrepnz[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ TNSZ("movsd",XMM,8), TNSZ("movsd",XMMS,8), TNSZ("movddup",XMM,8), INVALID, +/* [14] */ INVALID, INVALID, INVALID, INVALID, +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, TNSZ("cvtsi2sd",XMM3MX,4),TNSZ("movntsd",XMMMS,8), +/* [2C] */ TNSZ("cvttsd2si",XMMXM3,8),TNSZ("cvtsd2si",XMMXM3,8),INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, TNSZ("sqrtsd",XMM,8), INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ TNSZ("addsd",XMM,8), TNSZ("mulsd",XMM,8), TNSZ("cvtsd2ss",XMM,8), INVALID, +/* [5C] */ TNSZ("subsd",XMM,8), TNSZ("minsd",XMM,8), TNSZ("divsd",XMM,8), TNSZ("maxsd",XMM,8), + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ TNSZ("pshuflw",XMMP,16),INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ TNSZ("insertq",XMMX2I,16),TNSZ("insertq",XMM,8),INVALID, INVALID, +/* [7C] */ TNSZ("haddps",XMM,16), TNSZ("hsubps",XMM,16), INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, TNSZ("cmpsd",XMMP,8), INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ TNSZ("addsubps",XMM,16),INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, TNS("movdq2q",XMMXM), INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, TNSZ("cvtpd2dq",XMM,16),INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ TNS("lddqu",XMMM), INVALID, INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const instable_t dis_opAVXF20F[256] = { +/* [00] */ 
INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ TNSZ("vmovsd",VEX_RMrX,8), TNSZ("vmovsd",VEX_RRX,8), TNSZ("vmovddup",VEX_MX,8), INVALID, +/* [14] */ INVALID, INVALID, INVALID, INVALID, +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, TNSZ("vcvtsi2sd",VEX_RMrX,4),INVALID, +/* [2C] */ TNSZ("vcvttsd2si",VEX_MR,8),TNSZ("vcvtsd2si",VEX_MR,8),INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, TNSZ("vsqrtsd",VEX_RMrX,8), INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ TNSZ("vaddsd",VEX_RMrX,8), TNSZ("vmulsd",VEX_RMrX,8), TNSZ("vcvtsd2ss",VEX_RMrX,8), INVALID, +/* [5C] */ TNSZ("vsubsd",VEX_RMrX,8), TNSZ("vminsd",VEX_RMrX,8), TNSZ("vdivsd",VEX_RMrX,8), TNSZ("vmaxsd",VEX_RMrX,8), + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ TNSZ("vpshuflw",VEX_MXI,16),INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ TNSZ("vhaddps",VEX_RMrX,8), TNSZ("vhsubps",VEX_RMrX,8), INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, TNSZ("vcmpsd",VEX_RMRX,8), INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ TNSZ("vaddsubps",VEX_RMrX,8), INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, TNSZ("vcvtpd2dq",VEX_MX,16),INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ TNSZ("vlddqu",VEX_MX,16), INVALID, INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const instable_t dis_opAVXF20F3A[256] = { +/* [00] */ INVALID, 
INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ INVALID, INVALID, INVALID, INVALID, +/* [14] */ INVALID, INVALID, INVALID, INVALID, +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, INVALID, INVALID, +/* [2C] */ INVALID, INVALID, INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, INVALID, INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ INVALID, INVALID, INVALID, INVALID, +/* [5C] */ INVALID, INVALID, INVALID, INVALID, + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, INVALID, INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ TNSZvr("rorx",VEX_MXI,6),INVALID, INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const instable_t dis_opAVXF20F38[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ INVALID, INVALID, INVALID, INVALID, +/* [14] */ INVALID, INVALID, INVALID, INVALID, +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, 
INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, INVALID, INVALID, +/* [2C] */ INVALID, INVALID, INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, INVALID, INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ INVALID, INVALID, INVALID, INVALID, +/* [5C] */ INVALID, INVALID, INVALID, INVALID, + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, INVALID, INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ INVALID, INVALID, INVALID, INVALID, +/* [F4] */ INVALID, TNSZvr("pdep",VEX_RMrX,5),TNSZvr("mulx",VEX_RMrX,5),TNSZvr("shrx",VEX_VRMrX,5), +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const instable_t dis_opAVXF30F38[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ INVALID, INVALID, INVALID, INVALID, +/* [14] */ INVALID, INVALID, INVALID, INVALID, +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, INVALID, INVALID, +/* [2C] */ INVALID, INVALID, INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, 
INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, INVALID, INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ INVALID, INVALID, INVALID, INVALID, +/* [5C] */ INVALID, INVALID, INVALID, INVALID, + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, INVALID, INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ INVALID, INVALID, INVALID, INVALID, +/* [F4] */ INVALID, TNSZvr("pext",VEX_RMrX,5),INVALID, TNSZvr("sarx",VEX_VRMrX,5), +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; +/* + * Decode table for SIMD instructions with the repz (0xf3) prefix. 
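+ * For example, opcode 0f 10 normally decodes as movups via dis_op0F, but a + * leading f3 prefix routes the lookup through this table instead and the + * same byte decodes as movss.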
+ */ +const instable_t dis_opSIMDrepz[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ TNSZ("movss",XMM,4), TNSZ("movss",XMMS,4), TNSZ("movsldup",XMM,16),INVALID, +/* [14] */ INVALID, INVALID, TNSZ("movshdup",XMM,16),INVALID, +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, TNSZ("cvtsi2ss",XMM3MX,4),TNSZ("movntss",XMMMS,4), +/* [2C] */ TNSZ("cvttss2si",XMMXM3,4),TNSZ("cvtss2si",XMMXM3,4),INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, TNSZ("sqrtss",XMM,4), TNSZ("rsqrtss",XMM,4), TNSZ("rcpss",XMM,4), +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ TNSZ("addss",XMM,4), TNSZ("mulss",XMM,4), TNSZ("cvtss2sd",XMM,4), TNSZ("cvttps2dq",XMM,16), +/* [5C] */ TNSZ("subss",XMM,4), TNSZ("minss",XMM,4), TNSZ("divss",XMM,4), TNSZ("maxss",XMM,4), + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, TNSZ("movdqu",XMM,16), + +/* [70] */ TNSZ("pshufhw",XMMP,16),INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, TNSZ("movq",XMM,8), TNSZ("movdqu",XMMS,16), + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ TS("popcnt",MRw), INVALID, INVALID, INVALID, +/* [BC] */ TNSZ("tzcnt",MRw,5), TS("lzcnt",MRw), INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, TNSZ("cmpss",XMMP,4), INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, TNS("movq2dq",XMMMX), INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, TNSZ("cvtdq2pd",XMM,8), INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ INVALID, INVALID, INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const 
instable_t dis_opAVXF30F[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ TNSZ("vmovss",VEX_RMrX,4), TNSZ("vmovss",VEX_RRX,4), TNSZ("vmovsldup",VEX_MX,4), INVALID, +/* [14] */ INVALID, INVALID, TNSZ("vmovshdup",VEX_MX,4), INVALID, +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, TNSZ("vcvtsi2ss",VEX_RMrX,4),INVALID, +/* [2C] */ TNSZ("vcvttss2si",VEX_MR,4),TNSZ("vcvtss2si",VEX_MR,4),INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, TNSZ("vsqrtss",VEX_RMrX,4), TNSZ("vrsqrtss",VEX_RMrX,4), TNSZ("vrcpss",VEX_RMrX,4), +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ TNSZ("vaddss",VEX_RMrX,4), TNSZ("vmulss",VEX_RMrX,4), TNSZ("vcvtss2sd",VEX_RMrX,4), TNSZ("vcvttps2dq",VEX_MX,16), +/* [5C] */ TNSZ("vsubss",VEX_RMrX,4), TNSZ("vminss",VEX_RMrX,4), TNSZ("vdivss",VEX_RMrX,4), TNSZ("vmaxss",VEX_RMrX,4), + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, TNSZ("vmovdqu",VEX_MX,16), + +/* [70] */ TNSZ("vpshufhw",VEX_MXI,16),INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, TNSZ("vmovq",VEX_MX,8), TNSZ("vmovdqu",VEX_RX,16), + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, TNSZ("vcmpss",VEX_RMRX,4), INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, TNSZ("vcvtdq2pd",VEX_MX,8), INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ INVALID, INVALID, INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, 
INVALID, INVALID, INVALID, +}; +/* + * The following two tables are used to encode crc32 and movbe + * since they share the same opcodes. + */ +const instable_t dis_op0F38F0[2] = { +/* [00] */ TNS("crc32b",CRC32), + TS("movbe",MOVBE), +}; + +const instable_t dis_op0F38F1[2] = { +/* [00] */ TS("crc32",CRC32), + TS("movbe",MOVBE), +}; + +/* + * The following table is used to distinguish between adox and adcx which share + * the same opcodes. + */ +const instable_t dis_op0F38F6[2] = { +/* [00] */ TNS("adcx",ADX), + TNS("adox",ADX), +}; + +const instable_t dis_op0F38[256] = { +/* [00] */ TNSZ("pshufb",XMM_66o,16),TNSZ("phaddw",XMM_66o,16),TNSZ("phaddd",XMM_66o,16),TNSZ("phaddsw",XMM_66o,16), +/* [04] */ TNSZ("pmaddubsw",XMM_66o,16),TNSZ("phsubw",XMM_66o,16), TNSZ("phsubd",XMM_66o,16),TNSZ("phsubsw",XMM_66o,16), +/* [08] */ TNSZ("psignb",XMM_66o,16),TNSZ("psignw",XMM_66o,16),TNSZ("psignd",XMM_66o,16),TNSZ("pmulhrsw",XMM_66o,16), +/* [0C] */ INVALID, INVALID, INVALID, INVALID, + +/* [10] */ TNSZ("pblendvb",XMM_66r,16),INVALID, INVALID, INVALID, +/* [14] */ TNSZ("blendvps",XMM_66r,16),TNSZ("blendvpd",XMM_66r,16),INVALID, TNSZ("ptest",XMM_66r,16), +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ TNSZ("pabsb",XMM_66o,16),TNSZ("pabsw",XMM_66o,16),TNSZ("pabsd",XMM_66o,16),INVALID, + +/* [20] */ TNSZ("pmovsxbw",XMM_66r,16),TNSZ("pmovsxbd",XMM_66r,16),TNSZ("pmovsxbq",XMM_66r,16),TNSZ("pmovsxwd",XMM_66r,16), +/* [24] */ TNSZ("pmovsxwq",XMM_66r,16),TNSZ("pmovsxdq",XMM_66r,16),INVALID, INVALID, +/* [28] */ TNSZ("pmuldq",XMM_66r,16),TNSZ("pcmpeqq",XMM_66r,16),TNSZ("movntdqa",XMMM_66r,16),TNSZ("packusdw",XMM_66r,16), +/* [2C] */ INVALID, INVALID, INVALID, INVALID, + +/* [30] */ TNSZ("pmovzxbw",XMM_66r,16),TNSZ("pmovzxbd",XMM_66r,16),TNSZ("pmovzxbq",XMM_66r,16),TNSZ("pmovzxwd",XMM_66r,16), +/* [34] */ TNSZ("pmovzxwq",XMM_66r,16),TNSZ("pmovzxdq",XMM_66r,16),INVALID, TNSZ("pcmpgtq",XMM_66r,16), +/* [38] */ TNSZ("pminsb",XMM_66r,16),TNSZ("pminsd",XMM_66r,16),TNSZ("pminuw",XMM_66r,16),TNSZ("pminud",XMM_66r,16), +/* [3C] */ TNSZ("pmaxsb",XMM_66r,16),TNSZ("pmaxsd",XMM_66r,16),TNSZ("pmaxuw",XMM_66r,16),TNSZ("pmaxud",XMM_66r,16), + +/* [40] */ TNSZ("pmulld",XMM_66r,16),TNSZ("phminposuw",XMM_66r,16),INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, INVALID, INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ INVALID, INVALID, INVALID, INVALID, +/* [5C] */ INVALID, INVALID, INVALID, INVALID, + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, + +/* [80] */ TNSy("invept", RM_66r), TNSy("invvpid", RM_66r),TNSy("invpcid", RM_66r),INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [8C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] 
*/ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, INVALID, INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ TNSZ("sha1nexte",XMM,16),TNSZ("sha1msg1",XMM,16),TNSZ("sha1msg2",XMM,16),TNSZ("sha256rnds2",XMM,16), +/* [CC] */ TNSZ("sha256msg1",XMM,16),TNSZ("sha256msg2",XMM,16),INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, TNSZ("aesimc",XMM_66r,16), +/* [DC] */ TNSZ("aesenc",XMM_66r,16),TNSZ("aesenclast",XMM_66r,16),TNSZ("aesdec",XMM_66r,16),TNSZ("aesdeclast",XMM_66r,16), + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, +/* [F0] */ IND(dis_op0F38F0), IND(dis_op0F38F1), INVALID, INVALID, +/* [F4] */ INVALID, INVALID, IND(dis_op0F38F6), INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const instable_t dis_opAVX660F38[256] = { +/* [00] */ TNSZ("vpshufb",VEX_RMrX,16),TNSZ("vphaddw",VEX_RMrX,16),TNSZ("vphaddd",VEX_RMrX,16),TNSZ("vphaddsw",VEX_RMrX,16), +/* [04] */ TNSZ("vpmaddubsw",VEX_RMrX,16),TNSZ("vphsubw",VEX_RMrX,16), TNSZ("vphsubd",VEX_RMrX,16),TNSZ("vphsubsw",VEX_RMrX,16), +/* [08] */ TNSZ("vpsignb",VEX_RMrX,16),TNSZ("vpsignw",VEX_RMrX,16),TNSZ("vpsignd",VEX_RMrX,16),TNSZ("vpmulhrsw",VEX_RMrX,16), +/* [0C] */ TNSZ("vpermilps",VEX_RMrX,8),TNSZ("vpermilpd",VEX_RMrX,16),TNSZ("vtestps",VEX_RRI,8), TNSZ("vtestpd",VEX_RRI,16), + +/* [10] */ INVALID, INVALID, INVALID, TNSZ("vcvtph2ps",VEX_MX,16), +/* [14] */ INVALID, INVALID, TNSZ("vpermps",VEX_RMrX,16),TNSZ("vptest",VEX_RRI,16), +/* [18] */ TNSZ("vbroadcastss",VEX_MX,4),TNSZ("vbroadcastsd",VEX_MX,8),TNSZ("vbroadcastf128",VEX_MX,16),INVALID, +/* [1C] */ TNSZ("vpabsb",VEX_MX,16),TNSZ("vpabsw",VEX_MX,16),TNSZ("vpabsd",VEX_MX,16),INVALID, + +/* [20] */ TNSZ("vpmovsxbw",VEX_MX,16),TNSZ("vpmovsxbd",VEX_MX,16),TNSZ("vpmovsxbq",VEX_MX,16),TNSZ("vpmovsxwd",VEX_MX,16), +/* [24] */ TNSZ("vpmovsxwq",VEX_MX,16),TNSZ("vpmovsxdq",VEX_MX,16),INVALID, INVALID, +/* [28] */ TNSZ("vpmuldq",VEX_RMrX,16),TNSZ("vpcmpeqq",VEX_RMrX,16),TNSZ("vmovntdqa",VEX_MX,16),TNSZ("vpackusdw",VEX_RMrX,16), +/* [2C] */ TNSZ("vmaskmovps",VEX_RMrX,8),TNSZ("vmaskmovpd",VEX_RMrX,16),TNSZ("vmaskmovps",VEX_RRM,8),TNSZ("vmaskmovpd",VEX_RRM,16), + +/* [30] */ TNSZ("vpmovzxbw",VEX_MX,16),TNSZ("vpmovzxbd",VEX_MX,16),TNSZ("vpmovzxbq",VEX_MX,16),TNSZ("vpmovzxwd",VEX_MX,16), +/* [34] */ TNSZ("vpmovzxwq",VEX_MX,16),TNSZ("vpmovzxdq",VEX_MX,16),TNSZ("vpermd",VEX_RMrX,16),TNSZ("vpcmpgtq",VEX_RMrX,16), +/* [38] */ TNSZ("vpminsb",VEX_RMrX,16),TNSZ("vpminsd",VEX_RMrX,16),TNSZ("vpminuw",VEX_RMrX,16),TNSZ("vpminud",VEX_RMrX,16), +/* [3C] */ TNSZ("vpmaxsb",VEX_RMrX,16),TNSZ("vpmaxsd",VEX_RMrX,16),TNSZ("vpmaxuw",VEX_RMrX,16),TNSZ("vpmaxud",VEX_RMrX,16), + +/* [40] */ TNSZ("vpmulld",VEX_RMrX,16),TNSZ("vphminposuw",VEX_MX,16),INVALID, INVALID, +/* [44] */ INVALID, TSaZ("vpsrlv",VEX_RMrX,16),TNSZ("vpsravd",VEX_RMrX,16),TSaZ("vpsllv",VEX_RMrX,16), +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, INVALID, INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ 
TNSZ("vpbroadcastd",VEX_MX,16),TNSZ("vpbroadcastq",VEX_MX,16),TNSZ("vbroadcasti128",VEX_MX,16),INVALID, +/* [5C] */ INVALID, INVALID, INVALID, INVALID, + +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ TNSZ("vpbroadcastb",VEX_MX,16),TNSZ("vpbroadcastw",VEX_MX,16),INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [8C] */ TSaZ("vpmaskmov",VEX_RMrX,16),INVALID, TSaZ("vpmaskmov",VEX_RRM,16),INVALID, + +/* [90] */ TNSZ("vpgatherd",VEX_SbVM,16),TNSZ("vpgatherq",VEX_SbVM,16),TNSZ("vgatherdp",VEX_SbVM,16),TNSZ("vgatherqp",VEX_SbVM,16), +/* [94] */ INVALID, INVALID, TNSZ("vfmaddsub132p",FMA,16),TNSZ("vfmsubadd132p",FMA,16), +/* [98] */ TNSZ("vfmadd132p",FMA,16),TNSZ("vfmadd132s",FMA,16),TNSZ("vfmsub132p",FMA,16),TNSZ("vfmsub132s",FMA,16), +/* [9C] */ TNSZ("vfnmadd132p",FMA,16),TNSZ("vfnmadd132s",FMA,16),TNSZ("vfnmsub132p",FMA,16),TNSZ("vfnmsub132s",FMA,16), + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, TNSZ("vfmaddsub213p",FMA,16),TNSZ("vfmsubadd213p",FMA,16), +/* [A8] */ TNSZ("vfmadd213p",FMA,16),TNSZ("vfmadd213s",FMA,16),TNSZ("vfmsub213p",FMA,16),TNSZ("vfmsub213s",FMA,16), +/* [AC] */ TNSZ("vfnmadd213p",FMA,16),TNSZ("vfnmadd213s",FMA,16),TNSZ("vfnmsub213p",FMA,16),TNSZ("vfnmsub213s",FMA,16), + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, TNSZ("vfmaddsub231p",FMA,16),TNSZ("vfmsubadd231p",FMA,16), +/* [B8] */ TNSZ("vfmadd231p",FMA,16),TNSZ("vfmadd231s",FMA,16),TNSZ("vfmsub231p",FMA,16),TNSZ("vfmsub231s",FMA,16), +/* [BC] */ TNSZ("vfnmadd231p",FMA,16),TNSZ("vfnmadd231s",FMA,16),TNSZ("vfnmsub231p",FMA,16),TNSZ("vfnmsub231s",FMA,16), + +/* [C0] */ INVALID, INVALID, INVALID, INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, TNSZ("vaesimc",VEX_MX,16), +/* [DC] */ TNSZ("vaesenc",VEX_RMrX,16),TNSZ("vaesenclast",VEX_RMrX,16),TNSZ("vaesdec",VEX_RMrX,16),TNSZ("vaesdeclast",VEX_RMrX,16), + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, +/* [F0] */ IND(dis_op0F38F0), IND(dis_op0F38F1), INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, TNSZvr("shlx",VEX_VRMrX,5), +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const instable_t dis_op0F3A[256] = { +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ TNSZ("roundps",XMMP_66r,16),TNSZ("roundpd",XMMP_66r,16),TNSZ("roundss",XMMP_66r,16),TNSZ("roundsd",XMMP_66r,16), +/* [0C] */ TNSZ("blendps",XMMP_66r,16),TNSZ("blendpd",XMMP_66r,16),TNSZ("pblendw",XMMP_66r,16),TNSZ("palignr",XMMP_66o,16), + +/* [10] */ INVALID, INVALID, INVALID, INVALID, +/* [14] */ TNSZ("pextrb",XMM3PM_66r,8),TNSZ("pextrw",XMM3PM_66r,16),TSZ("pextr",XMM3PM_66r,16),TNSZ("extractps",XMM3PM_66r,16), +/* [18] */ INVALID, INVALID, 
INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, + +/* [20] */ TNSZ("pinsrb",XMMPRM_66r,8),TNSZ("insertps",XMMP_66r,16),TSZ("pinsr",XMMPRM_66r,16),INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, INVALID, INVALID, +/* [2C] */ INVALID, INVALID, INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ TNSZ("dpps",XMMP_66r,16),TNSZ("dppd",XMMP_66r,16),TNSZ("mpsadbw",XMMP_66r,16),INVALID, +/* [44] */ TNSZ("pclmulqdq",XMMP_66r,16),INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, INVALID, + +/* [50] */ INVALID, INVALID, INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ INVALID, INVALID, INVALID, INVALID, +/* [5C] */ INVALID, INVALID, INVALID, INVALID, + +/* [60] */ TNSZ("pcmpestrm",XMMP_66r,16),TNSZ("pcmpestri",XMMP_66r,16),TNSZ("pcmpistrm",XMMP_66r,16),TNSZ("pcmpistri",XMMP_66r,16), +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [8C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, INVALID, INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ TNSZ("sha1rnds4",XMMP,16),INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, TNSZ("aeskeygenassist",XMMP_66r,16), + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ INVALID, INVALID, INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +const instable_t dis_opAVX660F3A[256] = { +/* [00] */ TNSZ("vpermq",VEX_MXI,16),TNSZ("vpermpd",VEX_MXI,16),TNSZ("vpblendd",VEX_RMRX,16),INVALID, +/* [04] */ TNSZ("vpermilps",VEX_MXI,8),TNSZ("vpermilpd",VEX_MXI,16),TNSZ("vperm2f128",VEX_RMRX,16),INVALID, +/* [08] */ TNSZ("vroundps",VEX_MXI,16),TNSZ("vroundpd",VEX_MXI,16),TNSZ("vroundss",VEX_RMRX,16),TNSZ("vroundsd",VEX_RMRX,16), +/* [0C] */ 
TNSZ("vblendps",VEX_RMRX,16),TNSZ("vblendpd",VEX_RMRX,16),TNSZ("vpblendw",VEX_RMRX,16),TNSZ("vpalignr",VEX_RMRX,16), + +/* [10] */ INVALID, INVALID, INVALID, INVALID, +/* [14] */ TNSZ("vpextrb",VEX_RRi,8),TNSZ("vpextrw",VEX_RRi,16),TNSZ("vpextrd",VEX_RRi,16),TNSZ("vextractps",VEX_RM,16), +/* [18] */ TNSZ("vinsertf128",VEX_RMRX,16),TNSZ("vextractf128",VEX_RX,16),INVALID, INVALID, +/* [1C] */ INVALID, TNSZ("vcvtps2ph",VEX_RX,16), INVALID, INVALID, + +/* [20] */ TNSZ("vpinsrb",VEX_RMRX,8),TNSZ("vinsertps",VEX_RMRX,16),TNSZ("vpinsrd",VEX_RMRX,16),INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ INVALID, INVALID, INVALID, INVALID, +/* [2C] */ INVALID, INVALID, INVALID, INVALID, + +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ TNSZ("vinserti128",VEX_RMRX,16),TNSZ("vextracti128",VEX_RIM,16),INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, + +/* [40] */ TNSZ("vdpps",VEX_RMRX,16),TNSZ("vdppd",VEX_RMRX,16),TNSZ("vmpsadbw",VEX_RMRX,16),INVALID, +/* [44] */ TNSZ("vpclmulqdq",VEX_RMRX,16),INVALID, TNSZ("vperm2i128",VEX_RMRX,16),INVALID, +/* [48] */ INVALID, INVALID, TNSZ("vblendvps",VEX_RMRX,8), TNSZ("vblendvpd",VEX_RMRX,16), +/* [4C] */ TNSZ("vpblendvb",VEX_RMRX,16),INVALID, INVALID, INVALID, + +/* [50] */ INVALID, INVALID, INVALID, INVALID, +/* [54] */ INVALID, INVALID, INVALID, INVALID, +/* [58] */ INVALID, INVALID, INVALID, INVALID, +/* [5C] */ INVALID, INVALID, INVALID, INVALID, + +/* [60] */ TNSZ("vpcmpestrm",VEX_MXI,16),TNSZ("vpcmpestri",VEX_MXI,16),TNSZ("vpcmpistrm",VEX_MXI,16),TNSZ("vpcmpistri",VEX_MXI,16), +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, + +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, INVALID, +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, + +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [8C] */ INVALID, INVALID, INVALID, INVALID, + +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, + +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, INVALID, INVALID, + +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, + +/* [C0] */ INVALID, INVALID, INVALID, INVALID, +/* [C4] */ INVALID, INVALID, INVALID, INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, + +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, TNSZ("vaeskeygenassist",VEX_MXI,16), + +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, + +/* [F0] */ INVALID, INVALID, INVALID, INVALID, +/* [F4] */ INVALID, INVALID, INVALID, INVALID, +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +}; + +/* 
+ * Decode table for 0x0F0D which uses the first byte of the mod_rm to + * indicate a sub-code. + */ +const instable_t dis_op0F0D[8] = { +/* [00] */ INVALID, TNS("prefetchw",PREF), TNS("prefetchwt1",PREF),INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +}; + +/* + * Decode table for 0x0F opcodes + */ + +const instable_t dis_op0F[16][16] = { +{ +/* [00] */ IND(dis_op0F00), IND(dis_op0F01), TNS("lar",MR), TNS("lsl",MR), +/* [04] */ INVALID, TNS("syscall",NORM), TNS("clts",NORM), TNS("sysret",NORM), +/* [08] */ TNS("invd",NORM), TNS("wbinvd",NORM), INVALID, TNS("ud2",NORM), +/* [0C] */ INVALID, IND(dis_op0F0D), INVALID, INVALID, +}, { +/* [10] */ TNSZ("movups",XMMO,16), TNSZ("movups",XMMOS,16),TNSZ("movlps",XMMO,8), TNSZ("movlps",XMMOS,8), +/* [14] */ TNSZ("unpcklps",XMMO,16),TNSZ("unpckhps",XMMO,16),TNSZ("movhps",XMMOM,8),TNSZ("movhps",XMMOMS,8), +/* [18] */ IND(dis_op0F18), INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, TS("nop",Mw), +}, { +/* [20] */ TSy("mov",SREG), TSy("mov",SREG), TSy("mov",SREG), TSy("mov",SREG), +/* [24] */ TSx("mov",SREG), INVALID, TSx("mov",SREG), INVALID, +/* [28] */ TNSZ("movaps",XMMO,16), TNSZ("movaps",XMMOS,16),TNSZ("cvtpi2ps",XMMOMX,8),TNSZ("movntps",XMMOS,16), +/* [2C] */ TNSZ("cvttps2pi",XMMOXMM,8),TNSZ("cvtps2pi",XMMOXMM,8),TNSZ("ucomiss",XMMO,4),TNSZ("comiss",XMMO,4), +}, { +/* [30] */ TNS("wrmsr",NORM), TNS("rdtsc",NORM), TNS("rdmsr",NORM), TNS("rdpmc",NORM), +/* [34] */ TNSx("sysenter",NORM), TNSx("sysexit",NORM), INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [40] */ TS("cmovx.o",MR), TS("cmovx.no",MR), TS("cmovx.b",MR), TS("cmovx.ae",MR), +/* [44] */ TS("cmovx.e",MR), TS("cmovx.ne",MR), TS("cmovx.be",MR), TS("cmovx.a",MR), +/* [48] */ TS("cmovx.s",MR), TS("cmovx.ns",MR), TS("cmovx.pe",MR), TS("cmovx.po",MR), +/* [4C] */ TS("cmovx.l",MR), TS("cmovx.ge",MR), TS("cmovx.le",MR), TS("cmovx.g",MR), +}, { +/* [50] */ TNS("movmskps",XMMOX3), TNSZ("sqrtps",XMMO,16), TNSZ("rsqrtps",XMMO,16),TNSZ("rcpps",XMMO,16), +/* [54] */ TNSZ("andps",XMMO,16), TNSZ("andnps",XMMO,16), TNSZ("orps",XMMO,16), TNSZ("xorps",XMMO,16), +/* [58] */ TNSZ("addps",XMMO,16), TNSZ("mulps",XMMO,16), TNSZ("cvtps2pd",XMMO,8),TNSZ("cvtdq2ps",XMMO,16), +/* [5C] */ TNSZ("subps",XMMO,16), TNSZ("minps",XMMO,16), TNSZ("divps",XMMO,16), TNSZ("maxps",XMMO,16), +}, { +/* [60] */ TNSZ("punpcklbw",MMO,4),TNSZ("punpcklwd",MMO,4),TNSZ("punpckldq",MMO,4),TNSZ("packsswb",MMO,8), +/* [64] */ TNSZ("pcmpgtb",MMO,8), TNSZ("pcmpgtw",MMO,8), TNSZ("pcmpgtd",MMO,8), TNSZ("packuswb",MMO,8), +/* [68] */ TNSZ("punpckhbw",MMO,8),TNSZ("punpckhwd",MMO,8),TNSZ("punpckhdq",MMO,8),TNSZ("packssdw",MMO,8), +/* [6C] */ TNSZ("INVALID",MMO,0), TNSZ("INVALID",MMO,0), TNSZ("movd",MMO,4), TNSZ("movq",MMO,8), +}, { +/* [70] */ TNSZ("pshufw",MMOPM,8), TNS("psrXXX",MR), TNS("psrXXX",MR), TNS("psrXXX",MR), +/* [74] */ TNSZ("pcmpeqb",MMO,8), TNSZ("pcmpeqw",MMO,8), TNSZ("pcmpeqd",MMO,8), TNS("emms",NORM), +/* [78] */ TNSy("vmread",RM), TNSy("vmwrite",MR), INVALID, INVALID, +/* [7C] */ INVALID, INVALID, TNSZ("movd",MMOS,4), TNSZ("movq",MMOS,8), +}, { +/* [80] */ TNS("jo",D), TNS("jno",D), TNS("jb",D), TNS("jae",D), +/* [84] */ TNS("je",D), TNS("jne",D), TNS("jbe",D), TNS("ja",D), +/* [88] */ TNS("js",D), TNS("jns",D), TNS("jp",D), TNS("jnp",D), +/* [8C] */ TNS("jl",D), TNS("jge",D), TNS("jle",D), TNS("jg",D), +}, { +/* [90] */ TNS("seto",Mb), TNS("setno",Mb), TNS("setb",Mb), TNS("setae",Mb), +/* [94] */ TNS("sete",Mb), TNS("setne",Mb), 
TNS("setbe",Mb), TNS("seta",Mb), +/* [98] */ TNS("sets",Mb), TNS("setns",Mb), TNS("setp",Mb), TNS("setnp",Mb), +/* [9C] */ TNS("setl",Mb), TNS("setge",Mb), TNS("setle",Mb), TNS("setg",Mb), +}, { +/* [A0] */ TSp("push",LSEG), TSp("pop",LSEG), TNS("cpuid",NORM), TS("bt",RMw), +/* [A4] */ TS("shld",DSHIFT), TS("shld",DSHIFTcl), INVALID, INVALID, +/* [A8] */ TSp("push",LSEG), TSp("pop",LSEG), TNS("rsm",NORM), TS("bts",RMw), +/* [AC] */ TS("shrd",DSHIFT), TS("shrd",DSHIFTcl), IND(dis_op0FAE), TS("imul",MRw), +}, { +/* [B0] */ TNS("cmpxchgb",RMw), TS("cmpxchg",RMw), TS("lss",MR), TS("btr",RMw), +/* [B4] */ TS("lfs",MR), TS("lgs",MR), TS("movzb",MOVZ), TNS("movzwl",MOVZ), +/* [B8] */ TNS("INVALID",MRw), INVALID, IND(dis_op0FBA), TS("btc",RMw), +/* [BC] */ TS("bsf",MRw), TS("bsr",MRw), TS("movsb",MOVZ), TNS("movswl",MOVZ), +}, { +/* [C0] */ TNS("xaddb",XADDB), TS("xadd",RMw), TNSZ("cmpps",XMMOPM,16),TNS("movnti",RM), +/* [C4] */ TNSZ("pinsrw",MMOPRM,2),TNS("pextrw",MMO3P), TNSZ("shufps",XMMOPM,16),IND(dis_op0FC7), +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [D0] */ INVALID, TNSZ("psrlw",MMO,8), TNSZ("psrld",MMO,8), TNSZ("psrlq",MMO,8), +/* [D4] */ TNSZ("paddq",MMO,8), TNSZ("pmullw",MMO,8), TNSZ("INVALID",MMO,0), TNS("pmovmskb",MMOM3), +/* [D8] */ TNSZ("psubusb",MMO,8), TNSZ("psubusw",MMO,8), TNSZ("pminub",MMO,8), TNSZ("pand",MMO,8), +/* [DC] */ TNSZ("paddusb",MMO,8), TNSZ("paddusw",MMO,8), TNSZ("pmaxub",MMO,8), TNSZ("pandn",MMO,8), +}, { +/* [E0] */ TNSZ("pavgb",MMO,8), TNSZ("psraw",MMO,8), TNSZ("psrad",MMO,8), TNSZ("pavgw",MMO,8), +/* [E4] */ TNSZ("pmulhuw",MMO,8), TNSZ("pmulhw",MMO,8), TNS("INVALID",XMMO), TNSZ("movntq",MMOMS,8), +/* [E8] */ TNSZ("psubsb",MMO,8), TNSZ("psubsw",MMO,8), TNSZ("pminsw",MMO,8), TNSZ("por",MMO,8), +/* [EC] */ TNSZ("paddsb",MMO,8), TNSZ("paddsw",MMO,8), TNSZ("pmaxsw",MMO,8), TNSZ("pxor",MMO,8), +}, { +/* [F0] */ INVALID, TNSZ("psllw",MMO,8), TNSZ("pslld",MMO,8), TNSZ("psllq",MMO,8), +/* [F4] */ TNSZ("pmuludq",MMO,8), TNSZ("pmaddwd",MMO,8), TNSZ("psadbw",MMO,8), TNSZ("maskmovq",MMOIMPL,8), +/* [F8] */ TNSZ("psubb",MMO,8), TNSZ("psubw",MMO,8), TNSZ("psubd",MMO,8), TNSZ("psubq",MMO,8), +/* [FC] */ TNSZ("paddb",MMO,8), TNSZ("paddw",MMO,8), TNSZ("paddd",MMO,8), INVALID, +} }; + +const instable_t dis_opAVX0F[16][16] = { +{ +/* [00] */ INVALID, INVALID, INVALID, INVALID, +/* [04] */ INVALID, INVALID, INVALID, INVALID, +/* [08] */ INVALID, INVALID, INVALID, INVALID, +/* [0C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [10] */ TNSZ("vmovups",VEX_MX,16), TNSZ("vmovups",VEX_RM,16),TNSZ("vmovlps",VEX_RMrX,8), TNSZ("vmovlps",VEX_RM,8), +/* [14] */ TNSZ("vunpcklps",VEX_RMrX,16),TNSZ("vunpckhps",VEX_RMrX,16),TNSZ("vmovhps",VEX_RMrX,8),TNSZ("vmovhps",VEX_RM,8), +/* [18] */ INVALID, INVALID, INVALID, INVALID, +/* [1C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [20] */ INVALID, INVALID, INVALID, INVALID, +/* [24] */ INVALID, INVALID, INVALID, INVALID, +/* [28] */ TNSZ("vmovaps",VEX_MX,16), TNSZ("vmovaps",VEX_RX,16),INVALID, TNSZ("vmovntps",VEX_RM,16), +/* [2C] */ INVALID, INVALID, TNSZ("vucomiss",VEX_MX,4),TNSZ("vcomiss",VEX_MX,4), +}, { +/* [30] */ INVALID, INVALID, INVALID, INVALID, +/* [34] */ INVALID, INVALID, INVALID, INVALID, +/* [38] */ INVALID, INVALID, INVALID, INVALID, +/* [3C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [40] */ INVALID, INVALID, INVALID, INVALID, +/* [44] */ INVALID, INVALID, INVALID, INVALID, +/* [48] */ INVALID, INVALID, INVALID, INVALID, +/* [4C] */ INVALID, INVALID, INVALID, 
INVALID, +}, { +/* [50] */ TNS("vmovmskps",VEX_MR), TNSZ("vsqrtps",VEX_MX,16), TNSZ("vrsqrtps",VEX_MX,16),TNSZ("vrcpps",VEX_MX,16), +/* [54] */ TNSZ("vandps",VEX_RMrX,16), TNSZ("vandnps",VEX_RMrX,16), TNSZ("vorps",VEX_RMrX,16), TNSZ("vxorps",VEX_RMrX,16), +/* [58] */ TNSZ("vaddps",VEX_RMrX,16), TNSZ("vmulps",VEX_RMrX,16), TNSZ("vcvtps2pd",VEX_MX,8),TNSZ("vcvtdq2ps",VEX_MX,16), +/* [5C] */ TNSZ("vsubps",VEX_RMrX,16), TNSZ("vminps",VEX_RMrX,16), TNSZ("vdivps",VEX_RMrX,16), TNSZ("vmaxps",VEX_RMrX,16), +}, { +/* [60] */ INVALID, INVALID, INVALID, INVALID, +/* [64] */ INVALID, INVALID, INVALID, INVALID, +/* [68] */ INVALID, INVALID, INVALID, INVALID, +/* [6C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [70] */ INVALID, INVALID, INVALID, INVALID, +/* [74] */ INVALID, INVALID, INVALID, TNS("vzeroupper", VEX_NONE), +/* [78] */ INVALID, INVALID, INVALID, INVALID, +/* [7C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [80] */ INVALID, INVALID, INVALID, INVALID, +/* [84] */ INVALID, INVALID, INVALID, INVALID, +/* [88] */ INVALID, INVALID, INVALID, INVALID, +/* [8C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [90] */ INVALID, INVALID, INVALID, INVALID, +/* [94] */ INVALID, INVALID, INVALID, INVALID, +/* [98] */ INVALID, INVALID, INVALID, INVALID, +/* [9C] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [A0] */ INVALID, INVALID, INVALID, INVALID, +/* [A4] */ INVALID, INVALID, INVALID, INVALID, +/* [A8] */ INVALID, INVALID, INVALID, INVALID, +/* [AC] */ INVALID, INVALID, TNSZ("vldmxcsr",VEX_MO,2), INVALID, +}, { +/* [B0] */ INVALID, INVALID, INVALID, INVALID, +/* [B4] */ INVALID, INVALID, INVALID, INVALID, +/* [B8] */ INVALID, INVALID, INVALID, INVALID, +/* [BC] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [C0] */ INVALID, INVALID, TNSZ("vcmpps",VEX_RMRX,16),INVALID, +/* [C4] */ INVALID, INVALID, TNSZ("vshufps",VEX_RMRX,16),INVALID, +/* [C8] */ INVALID, INVALID, INVALID, INVALID, +/* [CC] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [D0] */ INVALID, INVALID, INVALID, INVALID, +/* [D4] */ INVALID, INVALID, INVALID, INVALID, +/* [D8] */ INVALID, INVALID, INVALID, INVALID, +/* [DC] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [E0] */ INVALID, INVALID, INVALID, INVALID, +/* [E4] */ INVALID, INVALID, INVALID, INVALID, +/* [E8] */ INVALID, INVALID, INVALID, INVALID, +/* [EC] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [F0] */ INVALID, INVALID, TNSZvr("andn",VEX_RMrX,5),TNSZvr("bls",BLS,5), +/* [F4] */ INVALID, TNSZvr("bzhi",VEX_VRMrX,5),INVALID, TNSZvr("bextr",VEX_VRMrX,5), +/* [F8] */ INVALID, INVALID, INVALID, INVALID, +/* [FC] */ INVALID, INVALID, INVALID, INVALID, +} }; + +/* + * Decode table for 0x80 opcodes + */ + +const instable_t dis_op80[8] = { + +/* [0] */ TNS("addb",IMlw), TNS("orb",IMw), TNS("adcb",IMlw), TNS("sbbb",IMlw), +/* [4] */ TNS("andb",IMw), TNS("subb",IMlw), TNS("xorb",IMw), TNS("cmpb",IMlw), +}; + + +/* + * Decode table for 0x81 opcodes. + */ + +const instable_t dis_op81[8] = { + +/* [0] */ TS("add",IMlw), TS("or",IMw), TS("adc",IMlw), TS("sbb",IMlw), +/* [4] */ TS("and",IMw), TS("sub",IMlw), TS("xor",IMw), TS("cmp",IMlw), +}; + + +/* + * Decode table for 0x82 opcodes. + */ + +const instable_t dis_op82[8] = { + +/* [0] */ TNSx("addb",IMlw), TNSx("orb",IMlw), TNSx("adcb",IMlw), TNSx("sbbb",IMlw), +/* [4] */ TNSx("andb",IMlw), TNSx("subb",IMlw), TNSx("xorb",IMlw), TNSx("cmpb",IMlw), +}; +/* + * Decode table for 0x83 opcodes. 
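+ * These forms take a sign-extended 8-bit immediate; the corresponding + * full-size immediate forms are the 0x81 opcodes above.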
+ */ + +const instable_t dis_op83[8] = { + +/* [0] */ TS("add",IMlw), TS("or",IMlw), TS("adc",IMlw), TS("sbb",IMlw), +/* [4] */ TS("and",IMlw), TS("sub",IMlw), TS("xor",IMlw), TS("cmp",IMlw), +}; + +/* + * Decode table for 0xC0 opcodes. + */ + +const instable_t dis_opC0[8] = { + +/* [0] */ TNS("rolb",MvI), TNS("rorb",MvI), TNS("rclb",MvI), TNS("rcrb",MvI), +/* [4] */ TNS("shlb",MvI), TNS("shrb",MvI), INVALID, TNS("sarb",MvI), +}; + +/* + * Decode table for 0xD0 opcodes. + */ + +const instable_t dis_opD0[8] = { + +/* [0] */ TNS("rolb",Mv), TNS("rorb",Mv), TNS("rclb",Mv), TNS("rcrb",Mv), +/* [4] */ TNS("shlb",Mv), TNS("shrb",Mv), TNS("salb",Mv), TNS("sarb",Mv), +}; + +/* + * Decode table for 0xC1 opcodes. + * 186 instruction set + */ + +const instable_t dis_opC1[8] = { + +/* [0] */ TS("rol",MvI), TS("ror",MvI), TS("rcl",MvI), TS("rcr",MvI), +/* [4] */ TS("shl",MvI), TS("shr",MvI), TS("sal",MvI), TS("sar",MvI), +}; + +/* + * Decode table for 0xD1 opcodes. + */ + +const instable_t dis_opD1[8] = { + +/* [0] */ TS("rol",Mv), TS("ror",Mv), TS("rcl",Mv), TS("rcr",Mv), +/* [4] */ TS("shl",Mv), TS("shr",Mv), TS("sal",Mv), TS("sar",Mv), +}; + + +/* + * Decode table for 0xD2 opcodes. + */ + +const instable_t dis_opD2[8] = { + +/* [0] */ TNS("rolb",Mv), TNS("rorb",Mv), TNS("rclb",Mv), TNS("rcrb",Mv), +/* [4] */ TNS("shlb",Mv), TNS("shrb",Mv), TNS("salb",Mv), TNS("sarb",Mv), +}; +/* + * Decode table for 0xD3 opcodes. + */ + +const instable_t dis_opD3[8] = { + +/* [0] */ TS("rol",Mv), TS("ror",Mv), TS("rcl",Mv), TS("rcr",Mv), +/* [4] */ TS("shl",Mv), TS("shr",Mv), TS("salb",Mv), TS("sar",Mv), +}; + + +/* + * Decode table for 0xF6 opcodes. + */ + +const instable_t dis_opF6[8] = { + +/* [0] */ TNS("testb",IMw), TNS("testb",IMw), TNS("notb",Mw), TNS("negb",Mw), +/* [4] */ TNS("mulb",MA), TNS("imulb",MA), TNS("divb",MA), TNS("idivb",MA), +}; + + +/* + * Decode table for 0xF7 opcodes. + */ + +const instable_t dis_opF7[8] = { + +/* [0] */ TS("test",IMw), TS("test",IMw), TS("not",Mw), TS("neg",Mw), +/* [4] */ TS("mul",MA), TS("imul",MA), TS("div",MA), TS("idiv",MA), +}; + + +/* + * Decode table for 0xFE opcodes. + */ + +const instable_t dis_opFE[8] = { + +/* [0] */ TNS("incb",Mw), TNS("decb",Mw), INVALID, INVALID, +/* [4] */ INVALID, INVALID, INVALID, INVALID, +}; +/* + * Decode table for 0xFF opcodes. 
+ */ + +const instable_t dis_opFF[8] = { + +/* [0] */ TS("inc",Mw), TS("dec",Mw), TNSyp("call",INM), TNS("lcall",INM), +/* [4] */ TNSy("jmp",INM), TNS("ljmp",INM), TSp("push",M), INVALID, +}; + +/* for 287 instructions, which are a mess to decode */ + +const instable_t dis_opFP1n2[8][8] = { +{ +/* bit pattern: 1101 1xxx MODxx xR/M */ +/* [0,0] */ TNS("fadds",M), TNS("fmuls",M), TNS("fcoms",M), TNS("fcomps",M), +/* [0,4] */ TNS("fsubs",M), TNS("fsubrs",M), TNS("fdivs",M), TNS("fdivrs",M), +}, { +/* [1,0] */ TNS("flds",M), INVALID, TNS("fsts",M), TNS("fstps",M), +/* [1,4] */ TNSZ("fldenv",M,28), TNSZ("fldcw",M,2), TNSZ("fnstenv",M,28), TNSZ("fnstcw",M,2), +}, { +/* [2,0] */ TNS("fiaddl",M), TNS("fimull",M), TNS("ficoml",M), TNS("ficompl",M), +/* [2,4] */ TNS("fisubl",M), TNS("fisubrl",M), TNS("fidivl",M), TNS("fidivrl",M), +}, { +/* [3,0] */ TNS("fildl",M), TNSZ("tisttpl",M,4), TNS("fistl",M), TNS("fistpl",M), +/* [3,4] */ INVALID, TNSZ("fldt",M,10), INVALID, TNSZ("fstpt",M,10), +}, { +/* [4,0] */ TNSZ("faddl",M,8), TNSZ("fmull",M,8), TNSZ("fcoml",M,8), TNSZ("fcompl",M,8), +/* [4,1] */ TNSZ("fsubl",M,8), TNSZ("fsubrl",M,8), TNSZ("fdivl",M,8), TNSZ("fdivrl",M,8), +}, { +/* [5,0] */ TNSZ("fldl",M,8), TNSZ("fisttpll",M,8), TNSZ("fstl",M,8), TNSZ("fstpl",M,8), +/* [5,4] */ TNSZ("frstor",M,108), INVALID, TNSZ("fnsave",M,108), TNSZ("fnstsw",M,2), +}, { +/* [6,0] */ TNSZ("fiadd",M,2), TNSZ("fimul",M,2), TNSZ("ficom",M,2), TNSZ("ficomp",M,2), +/* [6,4] */ TNSZ("fisub",M,2), TNSZ("fisubr",M,2), TNSZ("fidiv",M,2), TNSZ("fidivr",M,2), +}, { +/* [7,0] */ TNSZ("fild",M,2), TNSZ("fisttp",M,2), TNSZ("fist",M,2), TNSZ("fistp",M,2), +/* [7,4] */ TNSZ("fbld",M,10), TNSZ("fildll",M,8), TNSZ("fbstp",M,10), TNSZ("fistpll",M,8), +} }; + +const instable_t dis_opFP3[8][8] = { +{ +/* bit pattern: 1101 1xxx 11xx xREG */ +/* [0,0] */ TNS("fadd",FF), TNS("fmul",FF), TNS("fcom",F), TNS("fcomp",F), +/* [0,4] */ TNS("fsub",FF), TNS("fsubr",FF), TNS("fdiv",FF), TNS("fdivr",FF), +}, { +/* [1,0] */ TNS("fld",F), TNS("fxch",F), TNS("fnop",NORM), TNS("fstp",F), +/* [1,4] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [2,0] */ INVALID, INVALID, INVALID, INVALID, +/* [2,4] */ INVALID, TNS("fucompp",NORM), INVALID, INVALID, +}, { +/* [3,0] */ INVALID, INVALID, INVALID, INVALID, +/* [3,4] */ INVALID, INVALID, INVALID, INVALID, +}, { +/* [4,0] */ TNS("fadd",FF), TNS("fmul",FF), TNS("fcom",F), TNS("fcomp",F), +/* [4,4] */ TNS("fsub",FF), TNS("fsubr",FF), TNS("fdiv",FF), TNS("fdivr",FF), +}, { +/* [5,0] */ TNS("ffree",F), TNS("fxch",F), TNS("fst",F), TNS("fstp",F), +/* [5,4] */ TNS("fucom",F), TNS("fucomp",F), INVALID, INVALID, +}, { +/* [6,0] */ TNS("faddp",FF), TNS("fmulp",FF), TNS("fcomp",F), TNS("fcompp",NORM), +/* [6,4] */ TNS("fsubp",FF), TNS("fsubrp",FF), TNS("fdivp",FF), TNS("fdivrp",FF), +}, { +/* [7,0] */ TNS("ffreep",F), TNS("fxch",F), TNS("fstp",F), TNS("fstp",F), +/* [7,4] */ TNS("fnstsw",M), TNS("fucomip",FFC), TNS("fcomip",FFC), INVALID, +} }; + +const instable_t dis_opFP4[4][8] = { +{ +/* bit pattern: 1101 1001 111x xxxx */ +/* [0,0] */ TNS("fchs",NORM), TNS("fabs",NORM), INVALID, INVALID, +/* [0,4] */ TNS("ftst",NORM), TNS("fxam",NORM), TNS("ftstp",NORM), INVALID, +}, { +/* [1,0] */ TNS("fld1",NORM), TNS("fldl2t",NORM), TNS("fldl2e",NORM), TNS("fldpi",NORM), +/* [1,4] */ TNS("fldlg2",NORM), TNS("fldln2",NORM), TNS("fldz",NORM), INVALID, +}, { +/* [2,0] */ TNS("f2xm1",NORM), TNS("fyl2x",NORM), TNS("fptan",NORM), TNS("fpatan",NORM), +/* [2,4] */ TNS("fxtract",NORM), TNS("fprem1",NORM), TNS("fdecstp",NORM), 
TNS("fincstp",NORM), +}, { +/* [3,0] */ TNS("fprem",NORM), TNS("fyl2xp1",NORM), TNS("fsqrt",NORM), TNS("fsincos",NORM), +/* [3,4] */ TNS("frndint",NORM), TNS("fscale",NORM), TNS("fsin",NORM), TNS("fcos",NORM), +} }; + +const instable_t dis_opFP5[8] = { +/* bit pattern: 1101 1011 111x xxxx */ +/* [0] */ TNS("feni",NORM), TNS("fdisi",NORM), TNS("fnclex",NORM), TNS("fninit",NORM), +/* [4] */ TNS("fsetpm",NORM), TNS("frstpm",NORM), INVALID, INVALID, +}; + +const instable_t dis_opFP6[8] = { +/* bit pattern: 1101 1011 11yy yxxx */ +/* [00] */ TNS("fcmov.nb",FF), TNS("fcmov.ne",FF), TNS("fcmov.nbe",FF), TNS("fcmov.nu",FF), +/* [04] */ INVALID, TNS("fucomi",F), TNS("fcomi",F), INVALID, +}; + +const instable_t dis_opFP7[8] = { +/* bit pattern: 1101 1010 11yy yxxx */ +/* [00] */ TNS("fcmov.b",FF), TNS("fcmov.e",FF), TNS("fcmov.be",FF), TNS("fcmov.u",FF), +/* [04] */ INVALID, INVALID, INVALID, INVALID, +}; + +/* + * Main decode table for the op codes. The first two nibbles + * will be used as an index into the table. If there is a + * a need to further decode an instruction, the array to be + * referenced is indicated with the other two entries being + * empty. + */ + +const instable_t dis_distable[16][16] = { +{ +/* [0,0] */ TNS("addb",RMw), TS("add",RMw), TNS("addb",MRw), TS("add",MRw), +/* [0,4] */ TNS("addb",IA), TS("add",IA), TSx("push",SEG), TSx("pop",SEG), +/* [0,8] */ TNS("orb",RMw), TS("or",RMw), TNS("orb",MRw), TS("or",MRw), +/* [0,C] */ TNS("orb",IA), TS("or",IA), TSx("push",SEG), IND(dis_op0F), +}, { +/* [1,0] */ TNS("adcb",RMw), TS("adc",RMw), TNS("adcb",MRw), TS("adc",MRw), +/* [1,4] */ TNS("adcb",IA), TS("adc",IA), TSx("push",SEG), TSx("pop",SEG), +/* [1,8] */ TNS("sbbb",RMw), TS("sbb",RMw), TNS("sbbb",MRw), TS("sbb",MRw), +/* [1,C] */ TNS("sbbb",IA), TS("sbb",IA), TSx("push",SEG), TSx("pop",SEG), +}, { +/* [2,0] */ TNS("andb",RMw), TS("and",RMw), TNS("andb",MRw), TS("and",MRw), +/* [2,4] */ TNS("andb",IA), TS("and",IA), TNSx("%es:",OVERRIDE), TNSx("daa",NORM), +/* [2,8] */ TNS("subb",RMw), TS("sub",RMw), TNS("subb",MRw), TS("sub",MRw), +/* [2,C] */ TNS("subb",IA), TS("sub",IA), TNS("%cs:",OVERRIDE), TNSx("das",NORM), +}, { +/* [3,0] */ TNS("xorb",RMw), TS("xor",RMw), TNS("xorb",MRw), TS("xor",MRw), +/* [3,4] */ TNS("xorb",IA), TS("xor",IA), TNSx("%ss:",OVERRIDE), TNSx("aaa",NORM), +/* [3,8] */ TNS("cmpb",RMw), TS("cmp",RMw), TNS("cmpb",MRw), TS("cmp",MRw), +/* [3,C] */ TNS("cmpb",IA), TS("cmp",IA), TNSx("%ds:",OVERRIDE), TNSx("aas",NORM), +}, { +/* [4,0] */ TSx("inc",R), TSx("inc",R), TSx("inc",R), TSx("inc",R), +/* [4,4] */ TSx("inc",R), TSx("inc",R), TSx("inc",R), TSx("inc",R), +/* [4,8] */ TSx("dec",R), TSx("dec",R), TSx("dec",R), TSx("dec",R), +/* [4,C] */ TSx("dec",R), TSx("dec",R), TSx("dec",R), TSx("dec",R), +}, { +/* [5,0] */ TSp("push",R), TSp("push",R), TSp("push",R), TSp("push",R), +/* [5,4] */ TSp("push",R), TSp("push",R), TSp("push",R), TSp("push",R), +/* [5,8] */ TSp("pop",R), TSp("pop",R), TSp("pop",R), TSp("pop",R), +/* [5,C] */ TSp("pop",R), TSp("pop",R), TSp("pop",R), TSp("pop",R), +}, { +/* [6,0] */ TSZx("pusha",IMPLMEM,28),TSZx("popa",IMPLMEM,28), TSx("bound",MR), TNS("arpl",RMw), +/* [6,4] */ TNS("%fs:",OVERRIDE), TNS("%gs:",OVERRIDE), TNS("data16",DM), TNS("addr16",AM), +/* [6,8] */ TSp("push",I), TS("imul",IMUL), TSp("push",Ib), TS("imul",IMUL), +/* [6,C] */ TNSZ("insb",IMPLMEM,1), TSZ("ins",IMPLMEM,4), TNSZ("outsb",IMPLMEM,1),TSZ("outs",IMPLMEM,4), +}, { +/* [7,0] */ TNSy("jo",BD), TNSy("jno",BD), TNSy("jb",BD), TNSy("jae",BD), +/* [7,4] */ TNSy("je",BD), 
TNSy("jne",BD), TNSy("jbe",BD), TNSy("ja",BD), +/* [7,8] */ TNSy("js",BD), TNSy("jns",BD), TNSy("jp",BD), TNSy("jnp",BD), +/* [7,C] */ TNSy("jl",BD), TNSy("jge",BD), TNSy("jle",BD), TNSy("jg",BD), +}, { +/* [8,0] */ IND(dis_op80), IND(dis_op81), INDx(dis_op82), IND(dis_op83), +/* [8,4] */ TNS("testb",RMw), TS("test",RMw), TNS("xchgb",RMw), TS("xchg",RMw), +/* [8,8] */ TNS("movb",RMw), TS("mov",RMw), TNS("movb",MRw), TS("mov",MRw), +/* [8,C] */ TNS("movw",SM), TS("lea",MR), TNS("movw",MS), TSp("pop",M), +}, { +/* [9,0] */ TNS("nop",NORM), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), +/* [9,4] */ TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), TS("xchg",RA), +/* [9,8] */ TNS("cXtX",CBW), TNS("cXtX",CWD), TNSx("lcall",SO), TNS("fwait",NORM), +/* [9,C] */ TSZy("pushf",IMPLMEM,4),TSZy("popf",IMPLMEM,4), TNS("sahf",NORM), TNS("lahf",NORM), +}, { +/* [A,0] */ TNS("movb",OA), TS("mov",OA), TNS("movb",AO), TS("mov",AO), +/* [A,4] */ TNSZ("movsb",SD,1), TS("movs",SD), TNSZ("cmpsb",SD,1), TS("cmps",SD), +/* [A,8] */ TNS("testb",IA), TS("test",IA), TNS("stosb",AD), TS("stos",AD), +/* [A,C] */ TNS("lodsb",SA), TS("lods",SA), TNS("scasb",AD), TS("scas",AD), +}, { +/* [B,0] */ TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), +/* [B,4] */ TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), TNS("movb",IR), +/* [B,8] */ TS("mov",IR), TS("mov",IR), TS("mov",IR), TS("mov",IR), +/* [B,C] */ TS("mov",IR), TS("mov",IR), TS("mov",IR), TS("mov",IR), +}, { +/* [C,0] */ IND(dis_opC0), IND(dis_opC1), TNSyp("ret",RET), TNSyp("ret",NORM), +/* [C,4] */ TNSx("les",MR), TNSx("lds",MR), TNS("movb",IMw), TS("mov",IMw), +/* [C,8] */ TNSyp("enter",ENTER), TNSyp("leave",NORM), TNS("lret",RET), TNS("lret",NORM), +/* [C,C] */ TNS("int",INT3), TNS("int",INTx), TNSx("into",NORM), TNS("iret",NORM), +}, { +/* [D,0] */ IND(dis_opD0), IND(dis_opD1), IND(dis_opD2), IND(dis_opD3), +/* [D,4] */ TNSx("aam",U), TNSx("aad",U), TNSx("falc",NORM), TNSZ("xlat",IMPLMEM,1), + +/* 287 instructions. Note that although the indirect field */ +/* indicates opFP1n2 for further decoding, this is not necessarily */ +/* the case since the opFP arrays are not partitioned according to key1 */ +/* and key2. opFP1n2 is given only to indicate that we haven't */ +/* finished decoding the instruction. */ +/* [D,8] */ IND(dis_opFP1n2), IND(dis_opFP1n2), IND(dis_opFP1n2), IND(dis_opFP1n2), +/* [D,C] */ IND(dis_opFP1n2), IND(dis_opFP1n2), IND(dis_opFP1n2), IND(dis_opFP1n2), +}, { +/* [E,0] */ TNSy("loopnz",BD), TNSy("loopz",BD), TNSy("loop",BD), TNSy("jcxz",BD), +/* [E,4] */ TNS("inb",P), TS("in",P), TNS("outb",P), TS("out",P), +/* [E,8] */ TNSyp("call",D), TNSy("jmp",D), TNSx("ljmp",SO), TNSy("jmp",BD), +/* [E,C] */ TNS("inb",V), TS("in",V), TNS("outb",V), TS("out",V), +}, { +/* [F,0] */ TNS("lock",LOCK), TNS("icebp", NORM), TNS("repnz",PREFIX), TNS("repz",PREFIX), +/* [F,4] */ TNS("hlt",NORM), TNS("cmc",NORM), IND(dis_opF6), IND(dis_opF7), +/* [F,8] */ TNS("clc",NORM), TNS("stc",NORM), TNS("cli",NORM), TNS("sti",NORM), +/* [F,C] */ TNS("cld",NORM), TNS("std",NORM), IND(dis_opFE), IND(dis_opFF), +} }; + +/* END CSTYLED */ + +/* + * common functions to decode and disassemble an x86 or amd64 instruction + */ + +/* + * These are the individual fields of a REX prefix. 
Note that a REX + * prefix with none of these set is still needed to: + * - use the MOVSXD (sign extend 32 to 64 bits) instruction + * - access the %sil, %dil, %bpl, %spl registers + */ +#define REX_W 0x08 /* 64 bit operand size when set */ +#define REX_R 0x04 /* high order bit extension of ModRM reg field */ +#define REX_X 0x02 /* high order bit extension of SIB index field */ +#define REX_B 0x01 /* extends ModRM r_m, SIB base, or opcode reg */ + +/* + * These are the individual fields of a VEX prefix. + */ +#define VEX_R 0x08 /* REX.R in 1's complement form */ +#define VEX_X 0x04 /* REX.X in 1's complement form */ +#define VEX_B 0x02 /* REX.B in 1's complement form */ +/* Vector Length, 0: scalar or 128-bit vector, 1: 256-bit vector */ +#define VEX_L 0x04 +#define VEX_W 0x08 /* opcode specific, use like REX.W */ +#define VEX_m 0x1F /* VEX m-mmmm field */ +#define VEX_v 0x78 /* VEX register specifier */ +#define VEX_p 0x03 /* VEX pp field, opcode extension */ + +/* VEX m-mmmm field, only used by three bytes prefix */ +#define VEX_m_0F 0x01 /* implied 0F leading opcode byte */ +#define VEX_m_0F38 0x02 /* implied 0F 38 leading opcode byte */ +#define VEX_m_0F3A 0x03 /* implied 0F 3A leading opcode byte */ + +/* VEX pp field, providing equivalent functionality of a SIMD prefix */ +#define VEX_p_66 0x01 +#define VEX_p_F3 0x02 +#define VEX_p_F2 0x03 + +/* + * Even in 64 bit mode, usually only 4 byte immediate operands are supported. + */ +static int isize[] = {1, 2, 4, 4}; +static int isize64[] = {1, 2, 4, 8}; + +/* + * Just a bunch of useful macros. + */ +#define WBIT(x) (x & 0x1) /* to get w bit */ +#define REGNO(x) (x & 0x7) /* to get 3 bit register */ +#define VBIT(x) ((x)>>1 & 0x1) /* to get 'v' bit */ +#define OPSIZE(osize, wbit) ((wbit) ? isize[osize] : 1) +#define OPSIZE64(osize, wbit) ((wbit) ? isize64[osize] : 1) + +#define REG_ONLY 3 /* mode to indicate a register operand (not memory) */ + +#define BYTE_OPND 0 /* w-bit value indicating byte register */ +#define LONG_OPND 1 /* w-bit value indicating opnd_size register */ +#define MM_OPND 2 /* "value" used to indicate a mmx reg */ +#define XMM_OPND 3 /* "value" used to indicate a xmm reg */ +#define SEG_OPND 4 /* "value" used to indicate a segment reg */ +#define CONTROL_OPND 5 /* "value" used to indicate a control reg */ +#define DEBUG_OPND 6 /* "value" used to indicate a debug reg */ +#define TEST_OPND 7 /* "value" used to indicate a test reg */ +#define WORD_OPND 8 /* w-bit value indicating word size reg */ +#define YMM_OPND 9 /* "value" used to indicate a ymm reg */ + +/* + * The AVX2 gather instructions are a bit of a mess. While there's a pattern, + * there's not really a consistent scheme that we can use to know what the mode + * is supposed to be for a given type. Various instructions, like VPGATHERDD, + * always match the value of VEX_L. Other instructions like VPGATHERDQ, have + * some registers match VEX_L, but the VSIB is always XMM. + * + * The simplest way to deal with this is to just define a table based on the + * instruction opcodes, which are 0x90-0x93, so we subtract 0x90 to index into + * them. + * + * We further have to subdivide this based on the value of VEX_W and the value + * of VEX_L. The array is constructed to be indexed as: + * [opcode - 0x90][VEX_W][VEX_L]. 
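+ * + * For example, vpgatherdd (opcode 0x90, VEX.W = 0) follows VEX.L for all + * three registers, so dis_vgather[0][0][1] is all YMM, while vpgatherdq + * (opcode 0x90, VEX.W = 1) keeps an XMM VSIB register even when VEX.L + * selects 256-bit operands, as dis_vgather[0][1][1] below shows.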
+ */ +/* w = 0, 0x90 */ +typedef struct dis_gather_regs { + uint_t dgr_arg0; /* src reg */ + uint_t dgr_arg1; /* vsib reg */ + uint_t dgr_arg2; /* dst reg */ + char *dgr_suffix; /* suffix to append */ +} dis_gather_regs_t; + +static dis_gather_regs_t dis_vgather[4][2][2] = { + { + /* op 0x90, W.0 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "d" }, + { YMM_OPND, YMM_OPND, YMM_OPND, "d" } + }, + /* op 0x90, W.1 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "q" }, + { YMM_OPND, XMM_OPND, YMM_OPND, "q" } + } + }, + { + /* op 0x91, W.0 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "d" }, + { XMM_OPND, YMM_OPND, XMM_OPND, "d" }, + }, + /* op 0x91, W.1 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "q" }, + { YMM_OPND, YMM_OPND, YMM_OPND, "q" }, + } + }, + { + /* op 0x92, W.0 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "s" }, + { YMM_OPND, YMM_OPND, YMM_OPND, "s" } + }, + /* op 0x92, W.1 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "d" }, + { YMM_OPND, XMM_OPND, YMM_OPND, "d" } + } + }, + { + /* op 0x93, W.0 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "s" }, + { XMM_OPND, YMM_OPND, XMM_OPND, "s" } + }, + /* op 0x93, W.1 */ + { + { XMM_OPND, XMM_OPND, XMM_OPND, "d" }, + { YMM_OPND, YMM_OPND, YMM_OPND, "d" } + } + } +}; + +/* + * Get the next byte and separate the op code into the high and low nibbles. + */ +static int +dtrace_get_opcode(dis86_t *x, uint_t *high, uint_t *low) +{ + int byte; + + /* + * x86 instructions have a maximum length of 15 bytes. Bail out if + * we try to read more. + */ + if (x->d86_len >= 15) + return (x->d86_error = 1); + + if (x->d86_error) + return (1); + byte = x->d86_get_byte(x->d86_data); + if (byte < 0) + return (x->d86_error = 1); + x->d86_bytes[x->d86_len++] = byte; + *low = byte & 0xf; /* ----xxxx low 4 bits */ + *high = byte >> 4 & 0xf; /* xxxx---- bits 7 to 4 */ + return (0); +} + +/* + * Get and decode an SIB (scaled index base) byte + */ +static void +dtrace_get_SIB(dis86_t *x, uint_t *ss, uint_t *index, uint_t *base) +{ + int byte; + + if (x->d86_error) + return; + + byte = x->d86_get_byte(x->d86_data); + if (byte < 0) { + x->d86_error = 1; + return; + } + x->d86_bytes[x->d86_len++] = byte; + + *base = byte & 0x7; + *index = (byte >> 3) & 0x7; + *ss = (byte >> 6) & 0x3; +} + +/* + * Get the byte following the op code and separate it into the + * mode, register, and r/m fields. + */ +static void +dtrace_get_modrm(dis86_t *x, uint_t *mode, uint_t *reg, uint_t *r_m) +{ + if (x->d86_got_modrm == 0) { + if (x->d86_rmindex == -1) + x->d86_rmindex = x->d86_len; + dtrace_get_SIB(x, mode, reg, r_m); + x->d86_got_modrm = 1; + } +} + +/* + * Adjust register selection based on any REX prefix bits present. + */ +/*ARGSUSED*/ +static void +dtrace_rex_adjust(uint_t rex_prefix, uint_t mode, uint_t *reg, uint_t *r_m) +{ + if (reg != NULL && r_m == NULL) { + if (rex_prefix & REX_B) + *reg += 8; + } else { + if (reg != NULL && (REX_R & rex_prefix) != 0) + *reg += 8; + if (r_m != NULL && (REX_B & rex_prefix) != 0) + *r_m += 8; + } +} + +/* + * Adjust register selection based on any VEX prefix bits present. + * Notes: VEX.R, VEX.X and VEX.B use the inverted form compared with REX prefix + */ +/*ARGSUSED*/ +static void +dtrace_vex_adjust(uint_t vex_byte1, uint_t mode, uint_t *reg, uint_t *r_m) +{ + if (reg != NULL && r_m == NULL) { + if (!(vex_byte1 & VEX_B)) + *reg += 8; + } else { + if (reg != NULL && ((VEX_R & vex_byte1) == 0)) + *reg += 8; + if (r_m != NULL && ((VEX_B & vex_byte1) == 0)) + *r_m += 8; + } +} + +/* + * Get an immediate operand of the given size, with sign extension. 
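
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * assembling a little-endian immediate and sign-extending it to 64 bits,
 * the same idea dtrace_imm_opnd() applies to the instruction byte stream.
 * The input bytes here are made up.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t
imm_sext(const uint8_t *bytes, int size)
{
        uint64_t val = 0;
        int i;

        for (i = 0; i < size; i++)              /* little-endian assembly */
                val |= (uint64_t)bytes[i] << (i * 8);
        if (bytes[size - 1] & 0x80)             /* top bit set: extend with 0xff */
                for (; i < (int)sizeof (uint64_t); i++)
                        val |= (uint64_t)0xff << (i * 8);
        return (val);
}

int
main(void)
{
        uint8_t disp[4] = { 0xf8, 0xff, 0xff, 0xff };   /* -8 as a 32-bit imm */

        printf("%lld\n", (long long)(int64_t)imm_sext(disp, 4));
        return (0);
}
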
+ */ +static void +dtrace_imm_opnd(dis86_t *x, int wbit, int size, int opindex) +{ + int i; + int byte; + int valsize; + + if (x->d86_numopnds < opindex + 1) + x->d86_numopnds = opindex + 1; + + switch (wbit) { + case BYTE_OPND: + valsize = 1; + break; + case LONG_OPND: + if (x->d86_opnd_size == SIZE16) + valsize = 2; + else if (x->d86_opnd_size == SIZE32) + valsize = 4; + else + valsize = 8; + break; + case MM_OPND: + case XMM_OPND: + case YMM_OPND: + case SEG_OPND: + case CONTROL_OPND: + case DEBUG_OPND: + case TEST_OPND: + valsize = size; + break; + case WORD_OPND: + valsize = 2; + break; + } + if (valsize < size) + valsize = size; + + if (x->d86_error) + return; + x->d86_opnd[opindex].d86_value = 0; + for (i = 0; i < size; ++i) { + byte = x->d86_get_byte(x->d86_data); + if (byte < 0) { + x->d86_error = 1; + return; + } + x->d86_bytes[x->d86_len++] = byte; + x->d86_opnd[opindex].d86_value |= (uint64_t)byte << (i * 8); + } + /* Do sign extension */ + if (x->d86_bytes[x->d86_len - 1] & 0x80) { + for (; i < sizeof (uint64_t); i++) + x->d86_opnd[opindex].d86_value |= + (uint64_t)0xff << (i * 8); + } +#ifdef DIS_TEXT + x->d86_opnd[opindex].d86_mode = MODE_SIGNED; + x->d86_opnd[opindex].d86_value_size = valsize; + x->d86_imm_bytes += size; +#endif +} + +/* + * Get an ip relative operand of the given size, with sign extension. + */ +static void +dtrace_disp_opnd(dis86_t *x, int wbit, int size, int opindex) +{ + dtrace_imm_opnd(x, wbit, size, opindex); +#ifdef DIS_TEXT + x->d86_opnd[opindex].d86_mode = MODE_IPREL; +#endif +} + +/* + * Check to see if there is a segment override prefix pending. + * If so, print it in the current 'operand' location and set + * the override flag back to false. + */ +/*ARGSUSED*/ +static void +dtrace_check_override(dis86_t *x, int opindex) +{ +#ifdef DIS_TEXT + if (x->d86_seg_prefix) { + (void) strlcat(x->d86_opnd[opindex].d86_prefix, + x->d86_seg_prefix, PFIXLEN); + } +#endif + x->d86_seg_prefix = NULL; +} + + +/* + * Process a single instruction Register or Memory operand. + * + * mode = addressing mode from ModRM byte + * r_m = r_m (or reg if mode == 3) field from ModRM byte + * wbit = indicates which register (8bit, 16bit, ... MMX, etc.) set to use. + * o = index of operand that we are processing (0, 1 or 2) + * + * the value of reg or r_m must have already been adjusted for any REX prefix. 
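
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * how the REX.R and REX.B bits widen the 3-bit ModRM reg/r_m fields to
 * reach r8-r15, mirroring dtrace_rex_adjust() above.  The rex and modrm
 * byte values are made-up inputs.
 */
#include <stdio.h>

#define REX_R   0x04
#define REX_B   0x01

int
main(void)
{
        unsigned int rex = 0x4c;                /* REX.W + REX.R, for example */
        unsigned int modrm = 0xc3;              /* mode 3, reg 0, r_m 3 */
        unsigned int reg = (modrm >> 3) & 0x7;
        unsigned int r_m = modrm & 0x7;

        if (rex & REX_R)
                reg += 8;                       /* %rax..%rdi -> %r8..%r15 */
        if (rex & REX_B)
                r_m += 8;

        printf("reg=%u r_m=%u\n", reg, r_m);    /* reg=8 (%r8), r_m=3 (%rbx) */
        return (0);
}
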
+ */ +/*ARGSUSED*/ +static void +dtrace_get_operand(dis86_t *x, uint_t mode, uint_t r_m, int wbit, int opindex) +{ + int have_SIB = 0; /* flag presence of scale-index-byte */ + uint_t ss; /* scale-factor from opcode */ + uint_t index; /* index register number */ + uint_t base; /* base register number */ + int dispsize; /* size of displacement in bytes */ +#ifdef DIS_TEXT + char *opnd = x->d86_opnd[opindex].d86_opnd; +#endif + + if (x->d86_numopnds < opindex + 1) + x->d86_numopnds = opindex + 1; + + if (x->d86_error) + return; + + /* + * first handle a simple register + */ + if (mode == REG_ONLY) { +#ifdef DIS_TEXT + switch (wbit) { + case MM_OPND: + (void) strlcat(opnd, dis_MMREG[r_m], OPLEN); + break; + case XMM_OPND: + (void) strlcat(opnd, dis_XMMREG[r_m], OPLEN); + break; + case YMM_OPND: + (void) strlcat(opnd, dis_YMMREG[r_m], OPLEN); + break; + case SEG_OPND: + (void) strlcat(opnd, dis_SEGREG[r_m], OPLEN); + break; + case CONTROL_OPND: + (void) strlcat(opnd, dis_CONTROLREG[r_m], OPLEN); + break; + case DEBUG_OPND: + (void) strlcat(opnd, dis_DEBUGREG[r_m], OPLEN); + break; + case TEST_OPND: + (void) strlcat(opnd, dis_TESTREG[r_m], OPLEN); + break; + case BYTE_OPND: + if (x->d86_rex_prefix == 0) + (void) strlcat(opnd, dis_REG8[r_m], OPLEN); + else + (void) strlcat(opnd, dis_REG8_REX[r_m], OPLEN); + break; + case WORD_OPND: + (void) strlcat(opnd, dis_REG16[r_m], OPLEN); + break; + case LONG_OPND: + if (x->d86_opnd_size == SIZE16) + (void) strlcat(opnd, dis_REG16[r_m], OPLEN); + else if (x->d86_opnd_size == SIZE32) + (void) strlcat(opnd, dis_REG32[r_m], OPLEN); + else + (void) strlcat(opnd, dis_REG64[r_m], OPLEN); + break; + } +#endif /* DIS_TEXT */ + return; + } + + /* + * if symbolic representation, skip override prefix, if any + */ + dtrace_check_override(x, opindex); + + /* + * Handle 16 bit memory references first, since they decode + * the mode values more simply. + * mode 1 is r_m + 8 bit displacement + * mode 2 is r_m + 16 bit displacement + * mode 0 is just r_m, unless r_m is 6 which is 16 bit disp + */ + if (x->d86_addr_size == SIZE16) { + if ((mode == 0 && r_m == 6) || mode == 2) + dtrace_imm_opnd(x, WORD_OPND, 2, opindex); + else if (mode == 1) + dtrace_imm_opnd(x, BYTE_OPND, 1, opindex); +#ifdef DIS_TEXT + if (mode == 0 && r_m == 6) + x->d86_opnd[opindex].d86_mode = MODE_SIGNED; + else if (mode == 0) + x->d86_opnd[opindex].d86_mode = MODE_NONE; + else + x->d86_opnd[opindex].d86_mode = MODE_OFFSET; + (void) strlcat(opnd, dis_addr16[mode][r_m], OPLEN); +#endif + return; + } + + /* + * 32 and 64 bit addressing modes are more complex since they + * can involve an SIB (scaled index and base) byte to decode. + */ + if (r_m == ESP_REGNO || r_m == ESP_REGNO + 8) { + have_SIB = 1; + dtrace_get_SIB(x, &ss, &index, &base); + if (x->d86_error) + return; + if (base != 5 || mode != 0) + if (x->d86_rex_prefix & REX_B) + base += 8; + if (x->d86_rex_prefix & REX_X) + index += 8; + } else { + base = r_m; + } + + /* + * Compute the displacement size and get its bytes + */ + dispsize = 0; + + if (mode == 1) + dispsize = 1; + else if (mode == 2) + dispsize = 4; + else if ((r_m & 7) == EBP_REGNO || + (have_SIB && (base & 7) == EBP_REGNO)) + dispsize = 4; + + if (dispsize > 0) { + dtrace_imm_opnd(x, dispsize == 4 ? 
LONG_OPND : BYTE_OPND, + dispsize, opindex); + if (x->d86_error) + return; + } + +#ifdef DIS_TEXT + if (dispsize > 0) + x->d86_opnd[opindex].d86_mode = MODE_OFFSET; + + if (have_SIB == 0) { + if (x->d86_mode == SIZE32) { + if (mode == 0) + (void) strlcat(opnd, dis_addr32_mode0[r_m], + OPLEN); + else + (void) strlcat(opnd, dis_addr32_mode12[r_m], + OPLEN); + } else { + if (mode == 0) { + (void) strlcat(opnd, dis_addr64_mode0[r_m], + OPLEN); + if (r_m == 5) { + x->d86_opnd[opindex].d86_mode = + MODE_RIPREL; + } + } else { + (void) strlcat(opnd, dis_addr64_mode12[r_m], + OPLEN); + } + } + } else { + uint_t need_paren = 0; + char **regs; + char **bregs; + const char *const *sf; + if (x->d86_mode == SIZE32) /* NOTE this is not addr_size! */ + regs = (char **)dis_REG32; + else + regs = (char **)dis_REG64; + + if (x->d86_vsib != 0) { + if (wbit == YMM_OPND) /* NOTE this is not addr_size! */ + bregs = (char **)dis_YMMREG; + else + bregs = (char **)dis_XMMREG; + sf = dis_vscale_factor; + } else { + bregs = regs; + sf = dis_scale_factor; + } + + /* + * print the base (if any) + */ + if (base == EBP_REGNO && mode == 0) { + if (index != ESP_REGNO || x->d86_vsib != 0) { + (void) strlcat(opnd, "(", OPLEN); + need_paren = 1; + } + } else { + (void) strlcat(opnd, "(", OPLEN); + (void) strlcat(opnd, regs[base], OPLEN); + need_paren = 1; + } + + /* + * print the index (if any) + */ + if (index != ESP_REGNO || x->d86_vsib) { + (void) strlcat(opnd, ",", OPLEN); + (void) strlcat(opnd, bregs[index], OPLEN); + (void) strlcat(opnd, sf[ss], OPLEN); + } else + if (need_paren) + (void) strlcat(opnd, ")", OPLEN); + } +#endif +} + +/* + * Operand sequence for standard instruction involving one register + * and one register/memory operand. + * wbit indicates a byte(0) or opnd_size(1) operation + * vbit indicates direction (0 for "opcode r,r_m") or (1 for "opcode r_m, r") + */ +#define STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, vbit) { \ + dtrace_get_modrm(x, &mode, ®, &r_m); \ + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ + dtrace_get_operand(x, mode, r_m, wbit, vbit); \ + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1 - vbit); \ +} + +/* + * Similar to above, but allows for the two operands to be of different + * classes (ie. wbit). + * wbit is for the r_m operand + * w2 is for the reg operand + */ +#define MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, w2, vbit) { \ + dtrace_get_modrm(x, &mode, ®, &r_m); \ + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ + dtrace_get_operand(x, mode, r_m, wbit, vbit); \ + dtrace_get_operand(x, REG_ONLY, reg, w2, 1 - vbit); \ +} + +/* + * Similar, but for 2 operands plus an immediate. + * vbit indicates direction + * 0 for "opcode imm, r, r_m" or + * 1 for "opcode imm, r_m, r" + */ +#define THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize, vbit) { \ + dtrace_get_modrm(x, &mode, ®, &r_m); \ + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ + dtrace_get_operand(x, mode, r_m, wbit, 2-vbit); \ + dtrace_get_operand(x, REG_ONLY, reg, w2, 1+vbit); \ + dtrace_imm_opnd(x, wbit, immsize, 0); \ +} + +/* + * Similar, but for 2 operands plus two immediates. + */ +#define FOUROPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, immsize) { \ + dtrace_get_modrm(x, &mode, ®, &r_m); \ + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ + dtrace_get_operand(x, mode, r_m, wbit, 2); \ + dtrace_get_operand(x, REG_ONLY, reg, w2, 3); \ + dtrace_imm_opnd(x, wbit, immsize, 1); \ + dtrace_imm_opnd(x, wbit, immsize, 0); \ +} + +/* + * 1 operands plus two immediates. 
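
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * decoding an SIB byte and formatting the AT&T-style "disp(base,index,scale)"
 * operand that dtrace_get_operand() builds above.  The register-name table
 * and the sib/disp values are made up for the example; no REX handling is
 * shown.
 */
#include <stdio.h>

int
main(void)
{
        static const char *reg32[8] = {
                "%eax", "%ecx", "%edx", "%ebx",
                "%esp", "%ebp", "%esi", "%edi"
        };
        unsigned int sib = 0x88;        /* ss=2, index=1 (%ecx), base=0 (%eax) */
        unsigned int disp = 0x10;
        unsigned int base = sib & 0x7;
        unsigned int index = (sib >> 3) & 0x7;
        unsigned int ss = (sib >> 6) & 0x3;

        /* prints "0x10(%eax,%ecx,4)" */
        printf("0x%x(%s,%s,%d)\n", disp, reg32[base], reg32[index], 1 << ss);
        return (0);
}
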
+ */ +#define ONEOPERAND_TWOIMM(x, mode, reg, r_m, rex_prefix, wbit, immsize) { \ + dtrace_get_modrm(x, &mode, ®, &r_m); \ + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); \ + dtrace_get_operand(x, mode, r_m, wbit, 2); \ + dtrace_imm_opnd(x, wbit, immsize, 1); \ + dtrace_imm_opnd(x, wbit, immsize, 0); \ +} + +/* + * Dissassemble a single x86 or amd64 instruction. + * + * Mode determines the default operating mode (SIZE16, SIZE32 or SIZE64) + * for interpreting instructions. + * + * returns non-zero for bad opcode + */ +int +dtrace_disx86(dis86_t *x, uint_t cpu_mode) +{ + instable_t *dp; /* decode table being used */ +#ifdef DIS_TEXT + uint_t i; +#endif +#ifdef DIS_MEM + uint_t nomem = 0; +#define NOMEM (nomem = 1) +#else +#define NOMEM /* nothing */ +#endif + uint_t opnd_size; /* SIZE16, SIZE32 or SIZE64 */ + uint_t addr_size; /* SIZE16, SIZE32 or SIZE64 */ + uint_t wbit; /* opcode wbit, 0 is 8 bit, !0 for opnd_size */ + uint_t w2; /* wbit value for second operand */ + uint_t vbit; + uint_t mode = 0; /* mode value from ModRM byte */ + uint_t reg; /* reg value from ModRM byte */ + uint_t r_m; /* r_m value from ModRM byte */ + + uint_t opcode1; /* high nibble of 1st byte */ + uint_t opcode2; /* low nibble of 1st byte */ + uint_t opcode3; /* extra opcode bits usually from ModRM byte */ + uint_t opcode4; /* high nibble of 2nd byte */ + uint_t opcode5; /* low nibble of 2nd byte */ + uint_t opcode6; /* high nibble of 3rd byte */ + uint_t opcode7; /* low nibble of 3rd byte */ + uint_t opcode_bytes = 1; + + /* + * legacy prefixes come in 5 flavors, you should have only one of each + */ + uint_t opnd_size_prefix = 0; + uint_t addr_size_prefix = 0; + uint_t segment_prefix = 0; + uint_t lock_prefix = 0; + uint_t rep_prefix = 0; + uint_t rex_prefix = 0; /* amd64 register extension prefix */ + + /* + * Intel VEX instruction encoding prefix and fields + */ + + /* 0xC4 means 3 bytes prefix, 0xC5 means 2 bytes prefix */ + uint_t vex_prefix = 0; + + /* + * VEX prefix byte 1, includes vex.r, vex.x and vex.b + * (for 3 bytes prefix) + */ + uint_t vex_byte1 = 0; + + /* + * For 32-bit mode, it should prefetch the next byte to + * distinguish between AVX and les/lds + */ + uint_t vex_prefetch = 0; + + uint_t vex_m = 0; + uint_t vex_v = 0; + uint_t vex_p = 0; + uint_t vex_R = 1; + uint_t vex_X = 1; + uint_t vex_B = 1; + uint_t vex_W = 0; + uint_t vex_L; + dis_gather_regs_t *vreg; + +#ifdef DIS_TEXT + /* Instruction name for BLS* family of instructions */ + char *blsinstr; +#endif + + size_t off; + + instable_t dp_mmx; + + x->d86_len = 0; + x->d86_rmindex = -1; + x->d86_error = 0; +#ifdef DIS_TEXT + x->d86_numopnds = 0; + x->d86_seg_prefix = NULL; + x->d86_mnem[0] = 0; + for (i = 0; i < 4; ++i) { + x->d86_opnd[i].d86_opnd[0] = 0; + x->d86_opnd[i].d86_prefix[0] = 0; + x->d86_opnd[i].d86_value_size = 0; + x->d86_opnd[i].d86_value = 0; + x->d86_opnd[i].d86_mode = MODE_NONE; + } +#endif + x->d86_rex_prefix = 0; + x->d86_got_modrm = 0; + x->d86_memsize = 0; + x->d86_vsib = 0; + + if (cpu_mode == SIZE16) { + opnd_size = SIZE16; + addr_size = SIZE16; + } else if (cpu_mode == SIZE32) { + opnd_size = SIZE32; + addr_size = SIZE32; + } else { + opnd_size = SIZE32; + addr_size = SIZE64; + } + + /* + * Get one opcode byte and check for zero padding that follows + * jump tables. 
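
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * splitting an opcode byte into the high/low nibbles used to index
 * dis_distable, as dtrace_get_opcode() does.  The byte value is made up.
 */
#include <stdio.h>

int
main(void)
{
        unsigned int byte = 0x89;               /* e.g. a mov reg -> reg/mem */
        unsigned int high = (byte >> 4) & 0xf;  /* first table index */
        unsigned int low = byte & 0xf;          /* second table index */

        printf("dis_distable[0x%x][0x%x]\n", high, low);
        return (0);
}
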
+ */ + if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) + goto error; + + if (opcode1 == 0 && opcode2 == 0 && + x->d86_check_func != NULL && x->d86_check_func(x->d86_data)) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, ".byte\t0", OPLEN); +#endif + goto done; + } + + /* + * Gather up legacy x86 prefix bytes. + */ + for (;;) { + uint_t *which_prefix = NULL; + + dp = (instable_t *)&dis_distable[opcode1][opcode2]; + + switch (dp->it_adrmode) { + case PREFIX: + which_prefix = &rep_prefix; + break; + case LOCK: + which_prefix = &lock_prefix; + break; + case OVERRIDE: + which_prefix = &segment_prefix; +#ifdef DIS_TEXT + x->d86_seg_prefix = (char *)dp->it_name; +#endif + if (dp->it_invalid64 && cpu_mode == SIZE64) + goto error; + break; + case AM: + which_prefix = &addr_size_prefix; + break; + case DM: + which_prefix = &opnd_size_prefix; + break; + } + if (which_prefix == NULL) + break; + *which_prefix = (opcode1 << 4) | opcode2; + if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) + goto error; + } + + /* + * Handle amd64 mode PREFIX values. + * Some of the segment prefixes are no-ops. (only FS/GS actually work) + * We might have a REX prefix (opcodes 0x40-0x4f) + */ + if (cpu_mode == SIZE64) { + if (segment_prefix != 0x64 && segment_prefix != 0x65) + segment_prefix = 0; + + if (opcode1 == 0x4) { + rex_prefix = (opcode1 << 4) | opcode2; + if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) + goto error; + dp = (instable_t *)&dis_distable[opcode1][opcode2]; + } else if (opcode1 == 0xC && + (opcode2 == 0x4 || opcode2 == 0x5)) { + /* AVX instructions */ + vex_prefix = (opcode1 << 4) | opcode2; + x->d86_rex_prefix = 0x40; + } + } else if (opcode1 == 0xC && (opcode2 == 0x4 || opcode2 == 0x5)) { + /* LDS, LES or AVX */ + dtrace_get_modrm(x, &mode, ®, &r_m); + vex_prefetch = 1; + + if (mode == REG_ONLY) { + /* AVX */ + vex_prefix = (opcode1 << 4) | opcode2; + x->d86_rex_prefix = 0x40; + opcode3 = (((mode << 3) | reg)>>1) & 0x0F; + opcode4 = ((reg << 3) | r_m) & 0x0F; + } + } + + if (vex_prefix == VEX_2bytes) { + if (!vex_prefetch) { + if (dtrace_get_opcode(x, &opcode3, &opcode4) != 0) + goto error; + } + vex_R = ((opcode3 & VEX_R) & 0x0F) >> 3; + vex_L = ((opcode4 & VEX_L) & 0x0F) >> 2; + vex_v = (((opcode3 << 4) | opcode4) & VEX_v) >> 3; + vex_p = opcode4 & VEX_p; + /* + * The vex.x and vex.b bits are not defined in two bytes + * mode vex prefix, their default values are 1 + */ + vex_byte1 = (opcode3 & VEX_R) | VEX_X | VEX_B; + + if (vex_R == 0) + x->d86_rex_prefix |= REX_R; + + if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) + goto error; + + switch (vex_p) { + case VEX_p_66: + dp = (instable_t *) + &dis_opAVX660F[(opcode1 << 4) | opcode2]; + break; + case VEX_p_F3: + dp = (instable_t *) + &dis_opAVXF30F[(opcode1 << 4) | opcode2]; + break; + case VEX_p_F2: + dp = (instable_t *) + &dis_opAVXF20F [(opcode1 << 4) | opcode2]; + break; + default: + dp = (instable_t *) + &dis_opAVX0F[opcode1][opcode2]; + + } + + } else if (vex_prefix == VEX_3bytes) { + if (!vex_prefetch) { + if (dtrace_get_opcode(x, &opcode3, &opcode4) != 0) + goto error; + } + vex_R = (opcode3 & VEX_R) >> 3; + vex_X = (opcode3 & VEX_X) >> 2; + vex_B = (opcode3 & VEX_B) >> 1; + vex_m = (((opcode3 << 4) | opcode4) & VEX_m); + vex_byte1 = opcode3 & (VEX_R | VEX_X | VEX_B); + + if (vex_R == 0) + x->d86_rex_prefix |= REX_R; + if (vex_X == 0) + x->d86_rex_prefix |= REX_X; + if (vex_B == 0) + x->d86_rex_prefix |= REX_B; + + if (dtrace_get_opcode(x, &opcode5, &opcode6) != 0) + goto error; + vex_W = (opcode5 & VEX_W) >> 3; + vex_L = 
(opcode6 & VEX_L) >> 2; + vex_v = (((opcode5 << 4) | opcode6) & VEX_v) >> 3; + vex_p = opcode6 & VEX_p; + + if (vex_W) + x->d86_rex_prefix |= REX_W; + + /* Only these three vex_m values valid; others are reserved */ + if ((vex_m != VEX_m_0F) && (vex_m != VEX_m_0F38) && + (vex_m != VEX_m_0F3A)) + goto error; + + if (dtrace_get_opcode(x, &opcode1, &opcode2) != 0) + goto error; + + switch (vex_p) { + case VEX_p_66: + if (vex_m == VEX_m_0F) { + dp = (instable_t *) + &dis_opAVX660F + [(opcode1 << 4) | opcode2]; + } else if (vex_m == VEX_m_0F38) { + dp = (instable_t *) + &dis_opAVX660F38 + [(opcode1 << 4) | opcode2]; + } else if (vex_m == VEX_m_0F3A) { + dp = (instable_t *) + &dis_opAVX660F3A + [(opcode1 << 4) | opcode2]; + } else { + goto error; + } + break; + case VEX_p_F3: + if (vex_m == VEX_m_0F) { + dp = (instable_t *) + &dis_opAVXF30F + [(opcode1 << 4) | opcode2]; + } else if (vex_m == VEX_m_0F38) { + dp = (instable_t *) + &dis_opAVXF30F38 + [(opcode1 << 4) | opcode2]; + } else { + goto error; + } + break; + case VEX_p_F2: + if (vex_m == VEX_m_0F) { + dp = (instable_t *) + &dis_opAVXF20F + [(opcode1 << 4) | opcode2]; + } else if (vex_m == VEX_m_0F3A) { + dp = (instable_t *) + &dis_opAVXF20F3A + [(opcode1 << 4) | opcode2]; + } else if (vex_m == VEX_m_0F38) { + dp = (instable_t *) + &dis_opAVXF20F38 + [(opcode1 << 4) | opcode2]; + } else { + goto error; + } + break; + default: + dp = (instable_t *) + &dis_opAVX0F[opcode1][opcode2]; + + } + } + if (vex_prefix) { + if (dp->it_vexwoxmm) { + wbit = LONG_OPND; + } else { + if (vex_L) + wbit = YMM_OPND; + else + wbit = XMM_OPND; + } + } + + /* + * Deal with selection of operand and address size now. + * Note that the REX.W bit being set causes opnd_size_prefix to be + * ignored. + */ + if (cpu_mode == SIZE64) { + if ((rex_prefix & REX_W) || vex_W) + opnd_size = SIZE64; + else if (opnd_size_prefix) + opnd_size = SIZE16; + + if (addr_size_prefix) + addr_size = SIZE32; + } else if (cpu_mode == SIZE32) { + if (opnd_size_prefix) + opnd_size = SIZE16; + if (addr_size_prefix) + addr_size = SIZE16; + } else { + if (opnd_size_prefix) + opnd_size = SIZE32; + if (addr_size_prefix) + addr_size = SIZE32; + } + /* + * The pause instruction - a repz'd nop. This doesn't fit + * with any of the other prefix goop added for SSE, so we'll + * special-case it here. + */ + if (rep_prefix == 0xf3 && opcode1 == 0x9 && opcode2 == 0x0) { + rep_prefix = 0; + dp = (instable_t *)&dis_opPause; + } + + /* + * Some 386 instructions have 2 bytes of opcode before the mod_r/m + * byte so we may need to perform a table indirection. 
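
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * the shape of the 0x0F two-byte opcode indirection described above.  The
 * two tables are tiny stand-ins, not the real dis_distable/dis_op0F
 * contents; only the escape-byte lookup idea is shown.
 */
#include <stdio.h>

int
main(void)
{
        static const char *one_byte[16][16] = { [0x9][0x0] = "nop" };
        static const char *two_byte[16][16] = { [0xa][0x2] = "cpuid" };
        unsigned char code[] = { 0x0f, 0xa2 };  /* cpuid */
        const char *name;

        if (code[0] == 0x0f)    /* escape byte: index the second-level table */
                name = two_byte[code[1] >> 4][code[1] & 0xf];
        else
                name = one_byte[code[0] >> 4][code[0] & 0xf];
        printf("%s\n", name != NULL ? name : "INVALID");
        return (0);
}
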
+ */ + if (dp->it_indirect == (instable_t *)dis_op0F) { + if (dtrace_get_opcode(x, &opcode4, &opcode5) != 0) + goto error; + opcode_bytes = 2; + if (opcode4 == 0x7 && opcode5 >= 0x1 && opcode5 <= 0x3) { + uint_t subcode; + + if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0) + goto error; + opcode_bytes = 3; + subcode = ((opcode6 & 0x3) << 1) | + ((opcode7 & 0x8) >> 3); + dp = (instable_t *)&dis_op0F7123[opcode5][subcode]; + } else if ((opcode4 == 0xc) && (opcode5 >= 0x8)) { + dp = (instable_t *)&dis_op0FC8[0]; + } else if ((opcode4 == 0x3) && (opcode5 == 0xA)) { + opcode_bytes = 3; + if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0) + goto error; + if (opnd_size == SIZE16) + opnd_size = SIZE32; + + dp = (instable_t *)&dis_op0F3A[(opcode6<<4)|opcode7]; +#ifdef DIS_TEXT + if (strcmp(dp->it_name, "INVALID") == 0) + goto error; +#endif + switch (dp->it_adrmode) { + case XMMP: + break; + case XMMP_66r: + case XMMPRM_66r: + case XMM3PM_66r: + if (opnd_size_prefix == 0) { + goto error; + } + break; + case XMMP_66o: + if (opnd_size_prefix == 0) { + /* SSSE3 MMX instructions */ + dp_mmx = *dp; + dp = &dp_mmx; + dp->it_adrmode = MMOPM_66o; +#ifdef DIS_MEM + dp->it_size = 8; +#endif + } + break; + default: + goto error; + } + } else if ((opcode4 == 0x3) && (opcode5 == 0x8)) { + opcode_bytes = 3; + if (dtrace_get_opcode(x, &opcode6, &opcode7) != 0) + goto error; + dp = (instable_t *)&dis_op0F38[(opcode6<<4)|opcode7]; + + /* + * Both crc32 and movbe have the same 3rd opcode + * byte of either 0xF0 or 0xF1, so we use another + * indirection to distinguish between the two. + */ + if (dp->it_indirect == (instable_t *)dis_op0F38F0 || + dp->it_indirect == (instable_t *)dis_op0F38F1) { + + dp = dp->it_indirect; + if (rep_prefix != 0xF2) { + /* It is movbe */ + dp++; + } + } + + /* + * The adx family of instructions (adcx and adox) + * continue the classic Intel tradition of abusing + * arbitrary prefixes without actually meaning the + * prefix bit. Therefore, if we find either the + * opnd_size_prefix or rep_prefix we end up zeroing it + * out after making our determination so as to ensure + * that we don't get confused and accidentally print + * repz prefixes and the like on these instructions. + * + * In addition, these instructions are actually much + * closer to AVX instructions in semantics. Importantly, + * they always default to having 32-bit operands. + * However, if the CPU is in 64-bit mode, then and only + * then, does it use REX.w promotes things to 64-bits + * and REX.r allows 64-bit mode to use register r8-r15. 
+ */ + if (dp->it_indirect == (instable_t *)dis_op0F38F6) { + dp = dp->it_indirect; + if (opnd_size_prefix == 0 && + rep_prefix == 0xf3) { + /* It is adox */ + dp++; + } else if (opnd_size_prefix != 0x66 && + rep_prefix != 0) { + /* It isn't adcx */ + goto error; + } + opnd_size_prefix = 0; + rep_prefix = 0; + opnd_size = SIZE32; + if (rex_prefix & REX_W) + opnd_size = SIZE64; + } + +#ifdef DIS_TEXT + if (strcmp(dp->it_name, "INVALID") == 0) + goto error; +#endif + switch (dp->it_adrmode) { + case ADX: + case XMM: + break; + case RM_66r: + case XMM_66r: + case XMMM_66r: + if (opnd_size_prefix == 0) { + goto error; + } + break; + case XMM_66o: + if (opnd_size_prefix == 0) { + /* SSSE3 MMX instructions */ + dp_mmx = *dp; + dp = &dp_mmx; + dp->it_adrmode = MM; +#ifdef DIS_MEM + dp->it_size = 8; +#endif + } + break; + case CRC32: + if (rep_prefix != 0xF2) { + goto error; + } + rep_prefix = 0; + break; + case MOVBE: + if (rep_prefix != 0x0) { + goto error; + } + break; + default: + goto error; + } + } else { + dp = (instable_t *)&dis_op0F[opcode4][opcode5]; + } + } + + /* + * If still not at a TERM decode entry, then a ModRM byte + * exists and its fields further decode the instruction. + */ + x->d86_got_modrm = 0; + if (dp->it_indirect != TERM) { + dtrace_get_modrm(x, &mode, &opcode3, &r_m); + if (x->d86_error) + goto error; + reg = opcode3; + + /* + * decode 287 instructions (D8-DF) from opcodeN + */ + if (opcode1 == 0xD && opcode2 >= 0x8) { + if (opcode2 == 0xB && mode == 0x3 && opcode3 == 4) + dp = (instable_t *)&dis_opFP5[r_m]; + else if (opcode2 == 0xA && mode == 0x3 && opcode3 < 4) + dp = (instable_t *)&dis_opFP7[opcode3]; + else if (opcode2 == 0xB && mode == 0x3) + dp = (instable_t *)&dis_opFP6[opcode3]; + else if (opcode2 == 0x9 && mode == 0x3 && opcode3 >= 4) + dp = (instable_t *)&dis_opFP4[opcode3 - 4][r_m]; + else if (mode == 0x3) + dp = (instable_t *) + &dis_opFP3[opcode2 - 8][opcode3]; + else + dp = (instable_t *) + &dis_opFP1n2[opcode2 - 8][opcode3]; + } else { + dp = (instable_t *)dp->it_indirect + opcode3; + } + } + + /* + * In amd64 bit mode, ARPL opcode is changed to MOVSXD + * (sign extend 32bit to 64 bit) + */ + if ((vex_prefix == 0) && cpu_mode == SIZE64 && + opcode1 == 0x6 && opcode2 == 0x3) + dp = (instable_t *)&dis_opMOVSLD; + + /* + * at this point we should have a correct (or invalid) opcode + */ + if (cpu_mode == SIZE64 && dp->it_invalid64 || + cpu_mode != SIZE64 && dp->it_invalid32) + goto error; + if (dp->it_indirect != TERM) + goto error; + + /* + * Deal with MMX/SSE opcodes which are changed by prefixes. Note, we do + * need to include UNKNOWN below, as we may have instructions that + * actually have a prefix, but don't exist in any other form. + */ + switch (dp->it_adrmode) { + case UNKNOWN: + case MMO: + case MMOIMPL: + case MMO3P: + case MMOM3: + case MMOMS: + case MMOPM: + case MMOPRM: + case MMOS: + case XMMO: + case XMMOM: + case XMMOMS: + case XMMOPM: + case XMMOS: + case XMMOMX: + case XMMOX3: + case XMMOXMM: + /* + * This is horrible. Some SIMD instructions take the + * form 0x0F 0x?? ..., which is easily decoded using the + * existing tables. Other SIMD instructions use various + * prefix bytes to overload existing instructions. For + * Example, addps is F0, 58, whereas addss is F3 (repz), + * F0, 58. Presumably someone got a raise for this. + * + * If we see one of the instructions which can be + * modified in this way (if we've got one of the SIMDO* + * address modes), we'll check to see if the last prefix + * was a repz. 
If it was, we strip the prefix from the + * mnemonic, and we indirect using the dis_opSIMDrepz + * table. + */ + + /* + * Calculate our offset in dis_op0F + */ + if ((uintptr_t)dp - (uintptr_t)dis_op0F > sizeof (dis_op0F)) + goto error; + + off = ((uintptr_t)dp - (uintptr_t)dis_op0F) / + sizeof (instable_t); + + /* + * Rewrite if this instruction used one of the magic prefixes. + */ + if (rep_prefix) { + if (rep_prefix == 0xf2) + dp = (instable_t *)&dis_opSIMDrepnz[off]; + else + dp = (instable_t *)&dis_opSIMDrepz[off]; + rep_prefix = 0; + } else if (opnd_size_prefix) { + dp = (instable_t *)&dis_opSIMDdata16[off]; + opnd_size_prefix = 0; + if (opnd_size == SIZE16) + opnd_size = SIZE32; + } + break; + + case MG9: + /* + * More horribleness: the group 9 (0xF0 0xC7) instructions are + * allowed an optional prefix of 0x66 or 0xF3. This is similar + * to the SIMD business described above, but with a different + * addressing mode (and an indirect table), so we deal with it + * separately (if similarly). + * + * Intel further complicated this with the release of Ivy Bridge + * where they overloaded these instructions based on the ModR/M + * bytes. The VMX instructions have a mode of 0 since they are + * memory instructions but rdrand instructions have a mode of + * 0b11 (REG_ONLY) because they only operate on registers. While + * there are different prefix formats, for now it is sufficient + * to use a single different table. + */ + + /* + * Calculate our offset in dis_op0FC7 (the group 9 table) + */ + if ((uintptr_t)dp - (uintptr_t)dis_op0FC7 > sizeof (dis_op0FC7)) + goto error; + + off = ((uintptr_t)dp - (uintptr_t)dis_op0FC7) / + sizeof (instable_t); + + /* + * If we have a mode of 0b11 then we have to rewrite this. + */ + dtrace_get_modrm(x, &mode, ®, &r_m); + if (mode == REG_ONLY) { + dp = (instable_t *)&dis_op0FC7m3[off]; + break; + } + + /* + * Rewrite if this instruction used one of the magic prefixes. + */ + if (rep_prefix) { + if (rep_prefix == 0xf3) + dp = (instable_t *)&dis_opF30FC7[off]; + else + goto error; + rep_prefix = 0; + } else if (opnd_size_prefix) { + dp = (instable_t *)&dis_op660FC7[off]; + opnd_size_prefix = 0; + if (opnd_size == SIZE16) + opnd_size = SIZE32; + } + break; + + + case MMOSH: + /* + * As with the "normal" SIMD instructions, the MMX + * shuffle instructions are overloaded. These + * instructions, however, are special in that they use + * an extra byte, and thus an extra table. As of this + * writing, they only use the opnd_size prefix. + */ + + /* + * Calculate our offset in dis_op0F7123 + */ + if ((uintptr_t)dp - (uintptr_t)dis_op0F7123 > + sizeof (dis_op0F7123)) + goto error; + + if (opnd_size_prefix) { + off = ((uintptr_t)dp - (uintptr_t)dis_op0F7123) / + sizeof (instable_t); + dp = (instable_t *)&dis_opSIMD7123[off]; + opnd_size_prefix = 0; + if (opnd_size == SIZE16) + opnd_size = SIZE32; + } + break; + case MRw: + if (rep_prefix) { + if (rep_prefix == 0xf3) { + + /* + * Calculate our offset in dis_op0F + */ + if ((uintptr_t)dp - (uintptr_t)dis_op0F + > sizeof (dis_op0F)) + goto error; + + off = ((uintptr_t)dp - (uintptr_t)dis_op0F) / + sizeof (instable_t); + + dp = (instable_t *)&dis_opSIMDrepz[off]; + rep_prefix = 0; + } else { + goto error; + } + } + break; + } + + /* + * In 64 bit mode, some opcodes automatically use opnd_size == SIZE64. 
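
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * the 64-bit-mode operand size selection used above -- REX.W wins,
 * otherwise a 0x66 prefix shrinks the default 32-bit size to 16 bits.
 * (Opcodes marked it_always64, such as stack operations, are then promoted
 * separately.)  Inputs are made-up values.
 */
#include <stdio.h>

enum { SZ16 = 16, SZ32 = 32, SZ64 = 64 };

static int
opnd_size_64bit_mode(int rex_w, int opnd_size_prefix)
{
        if (rex_w)
                return (SZ64);          /* REX.W overrides the 0x66 prefix */
        if (opnd_size_prefix)
                return (SZ16);
        return (SZ32);                  /* default in 64-bit mode */
}

int
main(void)
{
        printf("%d %d %d\n",
            opnd_size_64bit_mode(1, 1),         /* 64 */
            opnd_size_64bit_mode(0, 1),         /* 16 */
            opnd_size_64bit_mode(0, 0));        /* 32 */
        return (0);
}
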
+ */ + if (cpu_mode == SIZE64) + if (dp->it_always64 || (opnd_size == SIZE32 && dp->it_stackop)) + opnd_size = SIZE64; + +#ifdef DIS_TEXT + /* + * At this point most instructions can format the opcode mnemonic + * including the prefixes. + */ + if (lock_prefix) + (void) strlcat(x->d86_mnem, "lock ", OPLEN); + + if (rep_prefix == 0xf2) + (void) strlcat(x->d86_mnem, "repnz ", OPLEN); + else if (rep_prefix == 0xf3) + (void) strlcat(x->d86_mnem, "repz ", OPLEN); + + if (cpu_mode == SIZE64 && addr_size_prefix) + (void) strlcat(x->d86_mnem, "addr32 ", OPLEN); + + if (dp->it_adrmode != CBW && + dp->it_adrmode != CWD && + dp->it_adrmode != XMMSFNC) { + if (strcmp(dp->it_name, "INVALID") == 0) + goto error; + (void) strlcat(x->d86_mnem, dp->it_name, OPLEN); + if (dp->it_avxsuf && dp->it_suffix) { + (void) strlcat(x->d86_mnem, vex_W != 0 ? "q" : "d", + OPLEN); + } else if (dp->it_suffix) { + char *types[] = {"", "w", "l", "q"}; + if (opcode_bytes == 2 && opcode4 == 4) { + /* It's a cmovx.yy. Replace the suffix x */ + for (i = 5; i < OPLEN; i++) { + if (x->d86_mnem[i] == '.') + break; + } + x->d86_mnem[i - 1] = *types[opnd_size]; + } else if ((opnd_size == 2) && (opcode_bytes == 3) && + ((opcode6 == 1 && opcode7 == 6) || + (opcode6 == 2 && opcode7 == 2))) { + /* + * To handle PINSRD and PEXTRD + */ + (void) strlcat(x->d86_mnem, "d", OPLEN); + } else { + (void) strlcat(x->d86_mnem, types[opnd_size], + OPLEN); + } + } + } +#endif + + /* + * Process operands based on the addressing modes. + */ + x->d86_mode = cpu_mode; + /* + * In vex mode the rex_prefix has no meaning + */ + if (!vex_prefix) + x->d86_rex_prefix = rex_prefix; + x->d86_opnd_size = opnd_size; + x->d86_addr_size = addr_size; + vbit = 0; /* initialize for mem/reg -> reg */ + switch (dp->it_adrmode) { + /* + * amd64 instruction to sign extend 32 bit reg/mem operands + * into 64 bit register values + */ + case MOVSXZ: +#ifdef DIS_TEXT + if (rex_prefix == 0) + (void) strncpy(x->d86_mnem, "movzld", OPLEN); +#endif + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + x->d86_opnd_size = SIZE64; + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); + x->d86_opnd_size = opnd_size = SIZE32; + wbit = LONG_OPND; + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + + /* + * movsbl movsbw movsbq (0x0FBE) or movswl movswq (0x0FBF) + * movzbl movzbw movzbq (0x0FB6) or movzwl movzwq (0x0FB7) + * wbit lives in 2nd byte, note that operands + * are different sized + */ + case MOVZ: + if (rex_prefix & REX_W) { + /* target register size = 64 bit */ + x->d86_mnem[5] = 'q'; + } + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); + x->d86_opnd_size = opnd_size = SIZE16; + wbit = WBIT(opcode5); + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + case CRC32: + opnd_size = SIZE32; + if (rex_prefix & REX_W) + opnd_size = SIZE64; + x->d86_opnd_size = opnd_size; + + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); + wbit = WBIT(opcode7); + if (opnd_size_prefix) + x->d86_opnd_size = opnd_size = SIZE16; + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + case MOVBE: + opnd_size = SIZE32; + if (rex_prefix & REX_W) + opnd_size = SIZE64; + x->d86_opnd_size = opnd_size; + + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + wbit = WBIT(opcode7); + if (opnd_size_prefix) + x->d86_opnd_size = opnd_size = SIZE16; + if 
(wbit) { + /* reg -> mem */ + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0); + dtrace_get_operand(x, mode, r_m, wbit, 1); + } else { + /* mem -> reg */ + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); + dtrace_get_operand(x, mode, r_m, wbit, 0); + } + break; + + /* + * imul instruction, with either 8-bit or longer immediate + * opcode 0x6B for byte, sign-extended displacement, 0x69 for word(s) + */ + case IMUL: + wbit = LONG_OPND; + THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, + OPSIZE(opnd_size, opcode2 == 0x9), 1); + break; + + /* memory or register operand to register, with 'w' bit */ + case MRw: + case ADX: + wbit = WBIT(opcode2); + STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); + break; + + /* register to memory or register operand, with 'w' bit */ + /* arpl happens to fit here also because it is odd */ + case RMw: + if (opcode_bytes == 2) + wbit = WBIT(opcode5); + else + wbit = WBIT(opcode2); + STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); + break; + + /* xaddb instruction */ + case XADDB: + wbit = 0; + STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); + break; + + /* MMX register to memory or register operand */ + case MMS: + case MMOS: +#ifdef DIS_TEXT + wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; +#else + wbit = LONG_OPND; +#endif + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1); + break; + + /* MMX register to memory */ + case MMOMS: + dtrace_get_modrm(x, &mode, ®, &r_m); + if (mode == REG_ONLY) + goto error; + wbit = MM_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 1); + break; + + /* Double shift. Has immediate operand specifying the shift. */ + case DSHIFT: + wbit = LONG_OPND; + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 2); + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); + dtrace_imm_opnd(x, wbit, 1, 0); + break; + + /* + * Double shift. With no immediate operand, specifies using %cl. 
+ */ + case DSHIFTcl: + wbit = LONG_OPND; + STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); + break; + + /* immediate to memory or register operand */ + case IMlw: + wbit = WBIT(opcode2); + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 1); + /* + * Have long immediate for opcode 0x81, but not 0x80 nor 0x83 + */ + dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, opcode2 == 1), 0); + break; + + /* immediate to memory or register operand with the */ + /* 'w' bit present */ + case IMw: + wbit = WBIT(opcode2); + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 1); + dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0); + break; + + /* immediate to register with register in low 3 bits */ + /* of op code */ + case IR: + /* w-bit here (with regs) is bit 3 */ + wbit = opcode2 >>3 & 0x1; + reg = REGNO(opcode2); + dtrace_rex_adjust(rex_prefix, mode, ®, NULL); + mode = REG_ONLY; + r_m = reg; + dtrace_get_operand(x, mode, r_m, wbit, 1); + dtrace_imm_opnd(x, wbit, OPSIZE64(opnd_size, wbit), 0); + break; + + /* MMX immediate shift of register */ + case MMSH: + case MMOSH: + wbit = MM_OPND; + goto mm_shift; /* in next case */ + + /* SIMD immediate shift of register */ + case XMMSH: + wbit = XMM_OPND; +mm_shift: + reg = REGNO(opcode7); + dtrace_rex_adjust(rex_prefix, mode, ®, NULL); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + dtrace_imm_opnd(x, wbit, 1, 0); + NOMEM; + break; + + /* accumulator to memory operand */ + case AO: + vbit = 1; + /*FALLTHROUGH*/ + + /* memory operand to accumulator */ + case OA: + wbit = WBIT(opcode2); + dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1 - vbit); + dtrace_imm_opnd(x, wbit, OPSIZE64(addr_size, LONG_OPND), vbit); +#ifdef DIS_TEXT + x->d86_opnd[vbit].d86_mode = MODE_OFFSET; +#endif + break; + + + /* segment register to memory or register operand */ + case SM: + vbit = 1; + /*FALLTHROUGH*/ + + /* memory or register operand to segment register */ + case MS: + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, LONG_OPND, vbit); + dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 1 - vbit); + break; + + /* + * rotate or shift instructions, which may shift by 1 or + * consult the cl register, depending on the 'v' bit + */ + case Mv: + vbit = VBIT(opcode2); + wbit = WBIT(opcode2); + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 1); +#ifdef DIS_TEXT + if (vbit) { + (void) strlcat(x->d86_opnd[0].d86_opnd, "%cl", OPLEN); + } else { + x->d86_opnd[0].d86_mode = MODE_SIGNED; + x->d86_opnd[0].d86_value_size = 1; + x->d86_opnd[0].d86_value = 1; + } +#endif + break; + /* + * immediate rotate or shift instructions + */ + case MvI: + wbit = WBIT(opcode2); +normal_imm_mem: + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 1); + dtrace_imm_opnd(x, wbit, 1, 0); + break; + + /* bit test instructions */ + case MIb: + wbit = LONG_OPND; + goto normal_imm_mem; + + /* single memory or register operand with 'w' bit present */ + case Mw: + wbit = WBIT(opcode2); +just_mem: + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + + case SWAPGS_RDTSCP: + if (cpu_mode == SIZE64 && mode == 3 && r_m == 0) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "swapgs", OPLEN); +#endif + NOMEM; + break; + } else if (mode == 3 && r_m == 
1) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "rdtscp", OPLEN); +#endif + NOMEM; + break; + } + + /*FALLTHROUGH*/ + + /* prefetch instruction - memory operand, but no memory acess */ + case PREF: + NOMEM; + /*FALLTHROUGH*/ + + /* single memory or register operand */ + case M: + case MG9: + wbit = LONG_OPND; + goto just_mem; + + /* single memory or register byte operand */ + case Mb: + wbit = BYTE_OPND; + goto just_mem; + + case VMx: + if (mode == 3) { +#ifdef DIS_TEXT + char *vminstr; + + switch (r_m) { + case 1: + vminstr = "vmcall"; + break; + case 2: + vminstr = "vmlaunch"; + break; + case 3: + vminstr = "vmresume"; + break; + case 4: + vminstr = "vmxoff"; + break; + default: + goto error; + } + + (void) strncpy(x->d86_mnem, vminstr, OPLEN); +#else + if (r_m < 1 || r_m > 4) + goto error; +#endif + + NOMEM; + break; + } + /*FALLTHROUGH*/ + case SVM: + if (mode == 3) { +#ifdef DIS_TEXT + char *vinstr; + + switch (r_m) { + case 0: + vinstr = "vmrun"; + break; + case 1: + vinstr = "vmmcall"; + break; + case 2: + vinstr = "vmload"; + break; + case 3: + vinstr = "vmsave"; + break; + case 4: + vinstr = "stgi"; + break; + case 5: + vinstr = "clgi"; + break; + case 6: + vinstr = "skinit"; + break; + case 7: + vinstr = "invlpga"; + break; + } + + (void) strncpy(x->d86_mnem, vinstr, OPLEN); +#endif + NOMEM; + break; + } + /*FALLTHROUGH*/ + case MONITOR_MWAIT: + if (mode == 3) { + if (r_m == 0) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "monitor", OPLEN); +#endif + NOMEM; + break; + } else if (r_m == 1) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "mwait", OPLEN); +#endif + NOMEM; + break; + } else if (r_m == 2) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "clac", OPLEN); +#endif + NOMEM; + break; + } else if (r_m == 3) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "stac", OPLEN); +#endif + NOMEM; + break; + } else { + goto error; + } + } + /*FALLTHROUGH*/ + case XGETBV_XSETBV: + if (mode == 3) { + if (r_m == 0) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "xgetbv", OPLEN); +#endif + NOMEM; + break; + } else if (r_m == 1) { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "xsetbv", OPLEN); +#endif + NOMEM; + break; + } else { + goto error; + } + + } + /*FALLTHROUGH*/ + case MO: + /* Similar to M, but only memory (no direct registers) */ + wbit = LONG_OPND; + dtrace_get_modrm(x, &mode, ®, &r_m); + if (mode == 3) + goto error; + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + + /* move special register to register or reverse if vbit */ + case SREG: + switch (opcode5) { + + case 2: + vbit = 1; + /*FALLTHROUGH*/ + case 0: + wbit = CONTROL_OPND; + break; + + case 3: + vbit = 1; + /*FALLTHROUGH*/ + case 1: + wbit = DEBUG_OPND; + break; + + case 6: + vbit = 1; + /*FALLTHROUGH*/ + case 4: + wbit = TEST_OPND; + break; + + } + dtrace_get_modrm(x, &mode, ®, &r_m); + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit); + dtrace_get_operand(x, REG_ONLY, r_m, LONG_OPND, 1 - vbit); + NOMEM; + break; + + /* + * single register operand with register in the low 3 + * bits of op code + */ + case R: + if (opcode_bytes == 2) + reg = REGNO(opcode5); + else + reg = REGNO(opcode2); + dtrace_rex_adjust(rex_prefix, mode, ®, NULL); + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0); + NOMEM; + break; + + /* + * register to accumulator with register in the low 3 + * bits of op code, xchg instructions + */ + case RA: + NOMEM; + reg = REGNO(opcode2); + dtrace_rex_adjust(rex_prefix, mode, ®, NULL); + 
dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 0); + dtrace_get_operand(x, REG_ONLY, EAX_REGNO, LONG_OPND, 1); + break; + + /* + * single segment register operand, with register in + * bits 3-4 of op code byte + */ + case SEG: + NOMEM; + reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x3; + dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0); + break; + + /* + * single segment register operand, with register in + * bits 3-5 of op code + */ + case LSEG: + NOMEM; + /* long seg reg from opcode */ + reg = (x->d86_bytes[x->d86_len - 1] >> 3) & 0x7; + dtrace_get_operand(x, REG_ONLY, reg, SEG_OPND, 0); + break; + + /* memory or register operand to register */ + case MR: + if (vex_prefetch) + x->d86_got_modrm = 1; + wbit = LONG_OPND; + STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); + break; + + case RM: + case RM_66r: + wbit = LONG_OPND; + STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 1); + break; + + /* MMX/SIMD-Int memory or mm reg to mm reg */ + case MM: + case MMO: +#ifdef DIS_TEXT + wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; +#else + wbit = LONG_OPND; +#endif + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0); + break; + + case MMOIMPL: +#ifdef DIS_TEXT + wbit = strcmp(dp->it_name, "movd") ? MM_OPND : LONG_OPND; +#else + wbit = LONG_OPND; +#endif + dtrace_get_modrm(x, &mode, ®, &r_m); + if (mode != REG_ONLY) + goto error; + + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 0); + dtrace_get_operand(x, REG_ONLY, reg, MM_OPND, 1); + mode = 0; /* change for memory access size... */ + break; + + /* MMX/SIMD-Int and SIMD-FP predicated mm reg to r32 */ + case MMO3P: + wbit = MM_OPND; + goto xmm3p; + case XMM3P: + wbit = XMM_OPND; +xmm3p: + dtrace_get_modrm(x, &mode, ®, &r_m); + if (mode != REG_ONLY) + goto error; + + THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 1, + 1); + NOMEM; + break; + + case XMM3PM_66r: + THREEOPERAND(x, mode, reg, r_m, rex_prefix, LONG_OPND, XMM_OPND, + 1, 0); + break; + + /* MMX/SIMD-Int predicated r32/mem to mm reg */ + case MMOPRM: + wbit = LONG_OPND; + w2 = MM_OPND; + goto xmmprm; + case XMMPRM: + case XMMPRM_66r: + wbit = LONG_OPND; + w2 = XMM_OPND; +xmmprm: + THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, w2, 1, 1); + break; + + /* MMX/SIMD-Int predicated mm/mem to mm reg */ + case MMOPM: + case MMOPM_66o: + wbit = w2 = MM_OPND; + goto xmmprm; + + /* MMX/SIMD-Int mm reg to r32 */ + case MMOM3: + NOMEM; + dtrace_get_modrm(x, &mode, ®, &r_m); + if (mode != REG_ONLY) + goto error; + wbit = MM_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0); + break; + + /* SIMD memory or xmm reg operand to xmm reg */ + case XMM: + case XMM_66o: + case XMM_66r: + case XMMO: + case XMMXIMPL: + wbit = XMM_OPND; + STANDARD_MODRM(x, mode, reg, r_m, rex_prefix, wbit, 0); + + if (dp->it_adrmode == XMMXIMPL && mode != REG_ONLY) + goto error; + +#ifdef DIS_TEXT + /* + * movlps and movhlps share opcodes. They differ in the + * addressing modes allowed for their operands. + * movhps and movlhps behave similarly. + */ + if (mode == REG_ONLY) { + if (strcmp(dp->it_name, "movlps") == 0) + (void) strncpy(x->d86_mnem, "movhlps", OPLEN); + else if (strcmp(dp->it_name, "movhps") == 0) + (void) strncpy(x->d86_mnem, "movlhps", OPLEN); + } +#endif + if (dp->it_adrmode == XMMXIMPL) + mode = 0; /* change for memory access size... 
*/ + break; + + /* SIMD xmm reg to memory or xmm reg */ + case XMMS: + case XMMOS: + case XMMMS: + case XMMOMS: + dtrace_get_modrm(x, &mode, ®, &r_m); +#ifdef DIS_TEXT + if ((strcmp(dp->it_name, "movlps") == 0 || + strcmp(dp->it_name, "movhps") == 0 || + strcmp(dp->it_name, "movntps") == 0) && + mode == REG_ONLY) + goto error; +#endif + wbit = XMM_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); + break; + + /* SIMD memory to xmm reg */ + case XMMM: + case XMMM_66r: + case XMMOM: + wbit = XMM_OPND; + dtrace_get_modrm(x, &mode, ®, &r_m); +#ifdef DIS_TEXT + if (mode == REG_ONLY) { + if (strcmp(dp->it_name, "movhps") == 0) + (void) strncpy(x->d86_mnem, "movlhps", OPLEN); + else + goto error; + } +#endif + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); + break; + + /* SIMD memory or r32 to xmm reg */ + case XMM3MX: + wbit = LONG_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); + break; + + case XMM3MXS: + wbit = LONG_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1); + break; + + /* SIMD memory or mm reg to xmm reg */ + case XMMOMX: + /* SIMD mm to xmm */ + case XMMMX: + wbit = MM_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 0); + break; + + /* SIMD memory or xmm reg to mm reg */ + case XMMXMM: + case XMMOXMM: + case XMMXM: + wbit = XMM_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, MM_OPND, 0); + break; + + + /* SIMD memory or xmm reg to r32 */ + case XMMXM3: + wbit = XMM_OPND; + MIXED_MM(x, mode, reg, r_m, rex_prefix, wbit, LONG_OPND, 0); + break; + + /* SIMD xmm to r32 */ + case XMMX3: + case XMMOX3: + dtrace_get_modrm(x, &mode, ®, &r_m); + if (mode != REG_ONLY) + goto error; + dtrace_rex_adjust(rex_prefix, mode, ®, &r_m); + dtrace_get_operand(x, mode, r_m, XMM_OPND, 0); + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, 1); + NOMEM; + break; + + /* SIMD predicated memory or xmm reg with/to xmm reg */ + case XMMP: + case XMMP_66r: + case XMMP_66o: + case XMMOPM: + wbit = XMM_OPND; + THREEOPERAND(x, mode, reg, r_m, rex_prefix, wbit, XMM_OPND, 1, + 1); + +#ifdef DIS_TEXT + /* + * cmpps and cmpss vary their instruction name based + * on the value of imm8. Other XMMP instructions, + * such as shufps, require explicit specification of + * the predicate. 
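
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * how a cmpps predicate immediate selects the printed mnemonic, in the
 * spirit of the dis_PREDSUFFIX rewrite below.  The suffix list mirrors the
 * SSE compare predicates 0-7; the imm8 value is made up.
 */
#include <stdio.h>

int
main(void)
{
        static const char *predsuffix[8] = {
                "eq", "lt", "le", "unord", "neq", "nlt", "nle", "ord"
        };
        unsigned int imm8 = 2;          /* e.g. cmpps $2, ... */
        char mnem[16];

        snprintf(mnem, sizeof (mnem), "cmp%sps", predsuffix[imm8 & 0x7]);
        printf("%s\n", mnem);           /* "cmpleps" */
        return (0);
}
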
+ */ + if (dp->it_name[0] == 'c' && + dp->it_name[1] == 'm' && + dp->it_name[2] == 'p' && + strlen(dp->it_name) == 5) { + uchar_t pred = x->d86_opnd[0].d86_value & 0xff; + + if (pred >= (sizeof (dis_PREDSUFFIX) / sizeof (char *))) + goto error; + + (void) strncpy(x->d86_mnem, "cmp", OPLEN); + (void) strlcat(x->d86_mnem, dis_PREDSUFFIX[pred], + OPLEN); + (void) strlcat(x->d86_mnem, + dp->it_name + strlen(dp->it_name) - 2, + OPLEN); + x->d86_opnd[0] = x->d86_opnd[1]; + x->d86_opnd[1] = x->d86_opnd[2]; + x->d86_numopnds = 2; + } +#endif + break; + + case XMMX2I: + FOUROPERAND(x, mode, reg, r_m, rex_prefix, XMM_OPND, XMM_OPND, + 1); + NOMEM; + break; + + case XMM2I: + ONEOPERAND_TWOIMM(x, mode, reg, r_m, rex_prefix, XMM_OPND, 1); + NOMEM; + break; + + /* immediate operand to accumulator */ + case IA: + wbit = WBIT(opcode2); + dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1); + dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, wbit), 0); + NOMEM; + break; + + /* memory or register operand to accumulator */ + case MA: + wbit = WBIT(opcode2); + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + + /* si register to di register used to reference memory */ + case SD: +#ifdef DIS_TEXT + dtrace_check_override(x, 0); + x->d86_numopnds = 2; + if (addr_size == SIZE64) { + (void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)", + OPLEN); + (void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)", + OPLEN); + } else if (addr_size == SIZE32) { + (void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)", + OPLEN); + (void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)", + OPLEN); + } else { + (void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)", + OPLEN); + (void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)", + OPLEN); + } +#endif + wbit = LONG_OPND; + break; + + /* accumulator to di register */ + case AD: + wbit = WBIT(opcode2); +#ifdef DIS_TEXT + dtrace_check_override(x, 1); + x->d86_numopnds = 2; + dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 0); + if (addr_size == SIZE64) + (void) strlcat(x->d86_opnd[1].d86_opnd, "(%rdi)", + OPLEN); + else if (addr_size == SIZE32) + (void) strlcat(x->d86_opnd[1].d86_opnd, "(%edi)", + OPLEN); + else + (void) strlcat(x->d86_opnd[1].d86_opnd, "(%di)", + OPLEN); +#endif + break; + + /* si register to accumulator */ + case SA: + wbit = WBIT(opcode2); +#ifdef DIS_TEXT + dtrace_check_override(x, 0); + x->d86_numopnds = 2; + if (addr_size == SIZE64) + (void) strlcat(x->d86_opnd[0].d86_opnd, "(%rsi)", + OPLEN); + else if (addr_size == SIZE32) + (void) strlcat(x->d86_opnd[0].d86_opnd, "(%esi)", + OPLEN); + else + (void) strlcat(x->d86_opnd[0].d86_opnd, "(%si)", + OPLEN); + dtrace_get_operand(x, REG_ONLY, EAX_REGNO, wbit, 1); +#endif + break; + + /* + * single operand, a 16/32 bit displacement + */ + case D: + wbit = LONG_OPND; + dtrace_disp_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0); + NOMEM; + break; + + /* jmp/call indirect to memory or register operand */ + case INM: +#ifdef DIS_TEXT + (void) strlcat(x->d86_opnd[0].d86_prefix, "*", OPLEN); +#endif + dtrace_rex_adjust(rex_prefix, mode, NULL, &r_m); + dtrace_get_operand(x, mode, r_m, LONG_OPND, 0); + wbit = LONG_OPND; + break; + + /* + * for long jumps and long calls -- a new code segment + * register and an offset in IP -- stored in object + * code in reverse order. 
Note - not valid in amd64 + */ + case SO: + dtrace_check_override(x, 1); + wbit = LONG_OPND; + dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 1); +#ifdef DIS_TEXT + x->d86_opnd[1].d86_mode = MODE_SIGNED; +#endif + /* will now get segment operand */ + dtrace_imm_opnd(x, wbit, 2, 0); + break; + + /* + * jmp/call. single operand, 8 bit displacement. + * added to current EIP in 'compofff' + */ + case BD: + dtrace_disp_opnd(x, BYTE_OPND, 1, 0); + NOMEM; + break; + + /* single 32/16 bit immediate operand */ + case I: + wbit = LONG_OPND; + dtrace_imm_opnd(x, wbit, OPSIZE(opnd_size, LONG_OPND), 0); + break; + + /* single 8 bit immediate operand */ + case Ib: + wbit = LONG_OPND; + dtrace_imm_opnd(x, wbit, 1, 0); + break; + + case ENTER: + wbit = LONG_OPND; + dtrace_imm_opnd(x, wbit, 2, 0); + dtrace_imm_opnd(x, wbit, 1, 1); + switch (opnd_size) { + case SIZE64: + x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 8; + break; + case SIZE32: + x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 4; + break; + case SIZE16: + x->d86_memsize = (x->d86_opnd[1].d86_value + 1) * 2; + break; + } + + break; + + /* 16-bit immediate operand */ + case RET: + wbit = LONG_OPND; + dtrace_imm_opnd(x, wbit, 2, 0); + break; + + /* single 8 bit port operand */ + case P: + dtrace_check_override(x, 0); + dtrace_imm_opnd(x, BYTE_OPND, 1, 0); + NOMEM; + break; + + /* single operand, dx register (variable port instruction) */ + case V: + x->d86_numopnds = 1; + dtrace_check_override(x, 0); +#ifdef DIS_TEXT + (void) strlcat(x->d86_opnd[0].d86_opnd, "(%dx)", OPLEN); +#endif + NOMEM; + break; + + /* + * The int instruction, which has two forms: + * int 3 (breakpoint) or + * int n, where n is indicated in the subsequent + * byte (format Ib). The int 3 instruction (opcode 0xCC), + * where, although the 3 looks like an operand, + * it is implied by the opcode. It must be converted + * to the correct base and output. + */ + case INT3: +#ifdef DIS_TEXT + x->d86_numopnds = 1; + x->d86_opnd[0].d86_mode = MODE_SIGNED; + x->d86_opnd[0].d86_value_size = 1; + x->d86_opnd[0].d86_value = 3; +#endif + NOMEM; + break; + + /* single 8 bit immediate operand */ + case INTx: + dtrace_imm_opnd(x, BYTE_OPND, 1, 0); + NOMEM; + break; + + /* an unused byte must be discarded */ + case U: + if (x->d86_get_byte(x->d86_data) < 0) + goto error; + x->d86_len++; + NOMEM; + break; + + case CBW: +#ifdef DIS_TEXT + if (opnd_size == SIZE16) + (void) strlcat(x->d86_mnem, "cbtw", OPLEN); + else if (opnd_size == SIZE32) + (void) strlcat(x->d86_mnem, "cwtl", OPLEN); + else + (void) strlcat(x->d86_mnem, "cltq", OPLEN); +#endif + wbit = LONG_OPND; + NOMEM; + break; + + case CWD: +#ifdef DIS_TEXT + if (opnd_size == SIZE16) + (void) strlcat(x->d86_mnem, "cwtd", OPLEN); + else if (opnd_size == SIZE32) + (void) strlcat(x->d86_mnem, "cltd", OPLEN); + else + (void) strlcat(x->d86_mnem, "cqtd", OPLEN); +#endif + wbit = LONG_OPND; + NOMEM; + break; + + case XMMSFNC: + /* + * sfence is sfence if mode is REG_ONLY. If mode isn't + * REG_ONLY, mnemonic should be 'clflush'. 
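
/*
 * Illustrative, stand-alone sketch (editor-added, not part of this patch):
 * 0F AE /7 prints as sfence when the ModRM mode is the register-only value
 * 3, and as clflush (with a memory operand) otherwise, as the case below
 * does.  The modrm byte is a made-up input.
 */
#include <stdio.h>

int
main(void)
{
        unsigned int modrm = 0xf8;              /* mode 3, reg 7, r_m 0 */
        unsigned int mode = (modrm >> 6) & 0x3;

        printf("%s\n", mode == 3 ? "sfence" : "clflush");
        return (0);
}
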
+ */ + dtrace_get_modrm(x, &mode, &reg, &r_m); + + /* sfence doesn't take operands */ +#ifdef DIS_TEXT + if (mode == REG_ONLY) { + (void) strlcat(x->d86_mnem, "sfence", OPLEN); + } else { + (void) strlcat(x->d86_mnem, "clflush", OPLEN); + dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m); + dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0); + NOMEM; + } +#else + if (mode != REG_ONLY) { + dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m); + dtrace_get_operand(x, mode, r_m, LONG_OPND, 0); + NOMEM; + } +#endif + break; + + /* + * no disassembly, the mnemonic was all there was so go on + */ + case NORM: + if (dp->it_invalid32 && cpu_mode != SIZE64) + goto error; + NOMEM; + /*FALLTHROUGH*/ + case IMPLMEM: + break; + + case XMMFENCE: + /* + * XRSTOR and LFENCE share the same opcode but differ in mode + */ + dtrace_get_modrm(x, &mode, &reg, &r_m); + + if (mode == REG_ONLY) { + /* + * Only the following exact byte sequences are allowed: + * + * 0f ae e8 lfence + * 0f ae f0 mfence + */ + if ((uint8_t)x->d86_bytes[x->d86_len - 1] != 0xe8 && + (uint8_t)x->d86_bytes[x->d86_len - 1] != 0xf0) + goto error; + } else { +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, "xrstor", OPLEN); +#endif + dtrace_rex_adjust(rex_prefix, mode, &reg, &r_m); + dtrace_get_operand(x, mode, r_m, BYTE_OPND, 0); + } + break; + + /* float reg */ + case F: +#ifdef DIS_TEXT + x->d86_numopnds = 1; + (void) strlcat(x->d86_opnd[0].d86_opnd, "%st(X)", OPLEN); + x->d86_opnd[0].d86_opnd[4] = r_m + '0'; +#endif + NOMEM; + break; + + /* float reg to float reg, with ret bit present */ + case FF: + vbit = opcode2 >> 2 & 0x1; /* vbit = 1: st -> st(i) */ + /*FALLTHROUGH*/ + case FFC: /* case for vbit always = 0 */ +#ifdef DIS_TEXT + x->d86_numopnds = 2; + (void) strlcat(x->d86_opnd[1 - vbit].d86_opnd, "%st", OPLEN); + (void) strlcat(x->d86_opnd[vbit].d86_opnd, "%st(X)", OPLEN); + x->d86_opnd[vbit].d86_opnd[4] = r_m + '0'; +#endif + NOMEM; + break; + + /* AVX instructions */ + case VEX_MO: + /* op(ModR/M.r/m) */ + x->d86_numopnds = 1; + dtrace_get_modrm(x, &mode, &reg, &r_m); +#ifdef DIS_TEXT + if ((dp == &dis_opAVX0F[0xA][0xE]) && (reg == 3)) + (void) strncpy(x->d86_mnem, "vstmxcsr", OPLEN); +#endif + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + case VEX_RMrX: + case FMA: + /* ModR/M.reg := op(VEX.vvvv, ModR/M.r/m) */ + x->d86_numopnds = 3; + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + /* + * In classic Intel fashion, the opcodes for all of the FMA + * instructions all have two possible mnemonics which vary by + * one letter, which is selected based on the value of the wbit. + * When wbit is one, they have the 'd' suffix and when 'wbit' is + * 0, they have the 's' suffix. Otherwise, the FMA instructions + * are all a standard VEX_RMrX. + */ +#ifdef DIS_TEXT + if (dp->it_adrmode == FMA) { + size_t len = strlen(dp->it_name); + (void) strncpy(x->d86_mnem, dp->it_name, OPLEN); + if (len + 1 < OPLEN) { + (void) strncpy(x->d86_mnem + len, + vex_W != 0 ? 
"d" : "s", OPLEN - len); + } + } +#endif + + if (mode != REG_ONLY) { + if ((dp == &dis_opAVXF20F[0x10]) || + (dp == &dis_opAVXF30F[0x10])) { + /* vmovsd , */ + /* or vmovss , */ + x->d86_numopnds = 2; + goto L_VEX_MX; + } + } + + dtrace_get_operand(x, REG_ONLY, reg, wbit, 2); + /* + * VEX prefix uses the 1's complement form to encode the + * XMM/YMM regs + */ + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1); + + if ((dp == &dis_opAVXF20F[0x2A]) || + (dp == &dis_opAVXF30F[0x2A])) { + /* + * vcvtsi2si , , or vcvtsi2ss , + * , + */ + wbit = LONG_OPND; + } +#ifdef DIS_TEXT + else if ((mode == REG_ONLY) && + (dp == &dis_opAVX0F[0x1][0x6])) { /* vmovlhps */ + (void) strncpy(x->d86_mnem, "vmovlhps", OPLEN); + } else if ((mode == REG_ONLY) && + (dp == &dis_opAVX0F[0x1][0x2])) { /* vmovhlps */ + (void) strncpy(x->d86_mnem, "vmovhlps", OPLEN); + } +#endif + dtrace_get_operand(x, mode, r_m, wbit, 0); + + break; + + case VEX_VRMrX: + /* ModR/M.reg := op(MODR/M.r/m, VEX.vvvv) */ + x->d86_numopnds = 3; + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + dtrace_get_operand(x, REG_ONLY, reg, wbit, 2); + /* + * VEX prefix uses the 1's complement form to encode the + * XMM/YMM regs + */ + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 0); + + dtrace_get_operand(x, mode, r_m, wbit, 1); + break; + + case VEX_SbVM: + /* ModR/M.reg := op(MODR/M.r/m, VSIB, VEX.vvvv) */ + x->d86_numopnds = 3; + x->d86_vsib = 1; + + /* + * All instructions that use VSIB are currently a mess. See the + * comment around the dis_gather_regs_t structure definition. + */ + + vreg = &dis_vgather[opcode2][vex_W][vex_L]; + +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, dp->it_name, OPLEN); + (void) strlcat(x->d86_mnem + strlen(dp->it_name), + vreg->dgr_suffix, OPLEN - strlen(dp->it_name)); +#endif + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + dtrace_get_operand(x, REG_ONLY, reg, vreg->dgr_arg2, 2); + /* + * VEX prefix uses the 1's complement form to encode the + * XMM/YMM regs + */ + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), vreg->dgr_arg0, + 0); + dtrace_get_operand(x, mode, r_m, vreg->dgr_arg1, 1); + break; + + case VEX_RRX: + /* ModR/M.rm := op(VEX.vvvv, ModR/M.reg) */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + if (mode != REG_ONLY) { + if ((dp == &dis_opAVXF20F[0x11]) || + (dp == &dis_opAVXF30F[0x11])) { + /* vmovsd , */ + /* or vmovss , */ + x->d86_numopnds = 2; + goto L_VEX_RM; + } + } + + dtrace_get_operand(x, mode, r_m, wbit, 2); + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 0); + break; + + case VEX_RMRX: + /* ModR/M.reg := op(VEX.vvvv, ModR/M.r_m, imm8[7:4]) */ + x->d86_numopnds = 4; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 3); + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 2); + if (dp == &dis_opAVX660F3A[0x18]) { + /* vinsertf128 , , , */ + dtrace_get_operand(x, mode, r_m, XMM_OPND, 1); + } else if ((dp == &dis_opAVX660F3A[0x20]) || + (dp == & dis_opAVX660F[0xC4])) { + /* vpinsrb , , , */ + /* or vpinsrw , , , */ + dtrace_get_operand(x, mode, r_m, LONG_OPND, 1); + } else if (dp == &dis_opAVX660F3A[0x22]) { + /* vpinsrd/q , , , */ +#ifdef DIS_TEXT + if (vex_W) + x->d86_mnem[6] = 'q'; +#endif + dtrace_get_operand(x, mode, r_m, LONG_OPND, 1); + } else { + dtrace_get_operand(x, mode, r_m, wbit, 1); + } 
+ + /* one byte immediate number */ + dtrace_imm_opnd(x, wbit, 1, 0); + + /* vblendvpd, vblendvps, vblendvb use the imm encode the regs */ + if ((dp == &dis_opAVX660F3A[0x4A]) || + (dp == &dis_opAVX660F3A[0x4B]) || + (dp == &dis_opAVX660F3A[0x4C])) { +#ifdef DIS_TEXT + int regnum = (x->d86_opnd[0].d86_value & 0xF0) >> 4; +#endif + x->d86_opnd[0].d86_mode = MODE_NONE; +#ifdef DIS_TEXT + if (vex_L) + (void) strncpy(x->d86_opnd[0].d86_opnd, + dis_YMMREG[regnum], OPLEN); + else + (void) strncpy(x->d86_opnd[0].d86_opnd, + dis_XMMREG[regnum], OPLEN); +#endif + } + break; + + case VEX_MX: + /* ModR/M.reg := op(ModR/M.rm) */ + x->d86_numopnds = 2; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); +L_VEX_MX: + + if ((dp == &dis_opAVXF20F[0xE6]) || + (dp == &dis_opAVX660F[0x5A]) || + (dp == &dis_opAVX660F[0xE6])) { + /* vcvtpd2dq , */ + /* or vcvtpd2ps , */ + /* or vcvttpd2dq , */ + dtrace_get_operand(x, REG_ONLY, reg, XMM_OPND, 1); + dtrace_get_operand(x, mode, r_m, wbit, 0); + } else if ((dp == &dis_opAVXF30F[0xE6]) || + (dp == &dis_opAVX0F[0x5][0xA]) || + (dp == &dis_opAVX660F38[0x13]) || + (dp == &dis_opAVX660F38[0x18]) || + (dp == &dis_opAVX660F38[0x19]) || + (dp == &dis_opAVX660F38[0x58]) || + (dp == &dis_opAVX660F38[0x78]) || + (dp == &dis_opAVX660F38[0x79]) || + (dp == &dis_opAVX660F38[0x59])) { + /* vcvtdq2pd , */ + /* or vcvtps2pd , */ + /* or vcvtph2ps , */ + /* or vbroadcasts* , */ + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + dtrace_get_operand(x, mode, r_m, XMM_OPND, 0); + } else if (dp == &dis_opAVX660F[0x6E]) { + /* vmovd/q , */ +#ifdef DIS_TEXT + if (vex_W) + x->d86_mnem[4] = 'q'; +#endif + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + dtrace_get_operand(x, mode, r_m, LONG_OPND, 0); + } else { + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + dtrace_get_operand(x, mode, r_m, wbit, 0); + } + + break; + + case VEX_MXI: + /* ModR/M.reg := op(ModR/M.rm, imm8) */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + dtrace_get_operand(x, REG_ONLY, reg, wbit, 2); + dtrace_get_operand(x, mode, r_m, wbit, 1); + + /* one byte immediate number */ + dtrace_imm_opnd(x, wbit, 1, 0); + break; + + case VEX_XXI: + /* VEX.vvvv := op(ModR/M.rm, imm8) */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, dis_AVXvgrp7[opcode2 - 1][reg], + OPLEN); +#endif + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 2); + dtrace_get_operand(x, REG_ONLY, r_m, wbit, 1); + + /* one byte immediate number */ + dtrace_imm_opnd(x, wbit, 1, 0); + break; + + case VEX_MR: + /* ModR/M.reg (reg32/64) := op(ModR/M.rm) */ + if (dp == &dis_opAVX660F[0xC5]) { + /* vpextrw , , */ + x->d86_numopnds = 2; + vbit = 2; + } else { + x->d86_numopnds = 2; + vbit = 1; + } + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + dtrace_get_operand(x, REG_ONLY, reg, LONG_OPND, vbit); + dtrace_get_operand(x, mode, r_m, wbit, vbit - 1); + + if (vbit == 2) + dtrace_imm_opnd(x, wbit, 1, 0); + + break; + + case VEX_RRI: + /* implicit(eflags/r32) := op(ModR/M.reg, ModR/M.rm) */ + x->d86_numopnds = 2; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + + case VEX_RX: + /* ModR/M.rm := op(ModR/M.reg) */ + /* vextractf128 || vcvtps2ph */ + if (dp == 
&dis_opAVX660F3A[0x19] || + dp == &dis_opAVX660F3A[0x1d]) { + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + dtrace_get_operand(x, mode, r_m, XMM_OPND, 2); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + + /* one byte immediate number */ + dtrace_imm_opnd(x, wbit, 1, 0); + break; + } + + x->d86_numopnds = 2; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 1); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 0); + break; + + case VEX_RR: + /* ModR/M.rm := op(ModR/M.reg) */ + x->d86_numopnds = 2; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + if (dp == &dis_opAVX660F[0x7E]) { + /* vmovd/q , */ +#ifdef DIS_TEXT + if (vex_W) + x->d86_mnem[4] = 'q'; +#endif + dtrace_get_operand(x, mode, r_m, LONG_OPND, 1); + } else + dtrace_get_operand(x, mode, r_m, wbit, 1); + + dtrace_get_operand(x, REG_ONLY, reg, wbit, 0); + break; + + case VEX_RRi: + /* ModR/M.rm := op(ModR/M.reg, imm) */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + +#ifdef DIS_TEXT + if (dp == &dis_opAVX660F3A[0x16]) { + /* vpextrd/q , , */ + if (vex_W) + x->d86_mnem[6] = 'q'; + } +#endif + dtrace_get_operand(x, mode, r_m, LONG_OPND, 2); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + + /* one byte immediate number */ + dtrace_imm_opnd(x, wbit, 1, 0); + break; + case VEX_RIM: + /* ModR/M.rm := op(ModR/M.reg, imm) */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + dtrace_get_operand(x, mode, r_m, XMM_OPND, 2); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + /* one byte immediate number */ + dtrace_imm_opnd(x, wbit, 1, 0); + break; + + case VEX_RM: + /* ModR/M.rm := op(ModR/M.reg) */ + if (dp == &dis_opAVX660F3A[0x17]) { /* vextractps */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + dtrace_get_operand(x, mode, r_m, LONG_OPND, 2); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 1); + /* one byte immediate number */ + dtrace_imm_opnd(x, wbit, 1, 0); + break; + } + x->d86_numopnds = 2; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); +L_VEX_RM: + vbit = 1; + dtrace_get_operand(x, mode, r_m, wbit, vbit); + dtrace_get_operand(x, REG_ONLY, reg, wbit, vbit - 1); + + break; + + case VEX_RRM: + /* ModR/M.rm := op(VEX.vvvv, ModR/M.reg) */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + dtrace_get_operand(x, mode, r_m, wbit, 2); + /* VEX use the 1's complement form encode the XMM/YMM regs */ + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 0); + break; + + case VEX_RMX: + /* ModR/M.reg := op(VEX.vvvv, ModR/M.rm) */ + x->d86_numopnds = 3; + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + dtrace_get_operand(x, REG_ONLY, reg, wbit, 2); + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1); + dtrace_get_operand(x, REG_ONLY, r_m, wbit, 0); + break; + + case VEX_NONE: +#ifdef DIS_TEXT + if (vex_L) + (void) strncpy(x->d86_mnem, "vzeroall", OPLEN); +#endif + break; + case BLS: { + + /* + * The BLS instructions are VEX instructions that are based on + * VEX.0F38.F3; however, they are considered special group 17 + * and like everything else, they use the bits in 3-5 of the 
+ * MOD R/M to determine the sub instruction. Unlike many others + * like the VMX instructions, these are valid both for memory + * and register forms. + */ + + dtrace_get_modrm(x, &mode, &reg, &r_m); + dtrace_vex_adjust(vex_byte1, mode, &reg, &r_m); + + switch (reg) { + case 1: +#ifdef DIS_TEXT + blsinstr = "blsr"; +#endif + break; + case 2: +#ifdef DIS_TEXT + blsinstr = "blsmsk"; +#endif + break; + case 3: +#ifdef DIS_TEXT + blsinstr = "blsi"; +#endif + break; + default: + goto error; + } + + x->d86_numopnds = 2; +#ifdef DIS_TEXT + (void) strncpy(x->d86_mnem, blsinstr, OPLEN); +#endif + dtrace_get_operand(x, REG_ONLY, (0xF - vex_v), wbit, 1); + dtrace_get_operand(x, mode, r_m, wbit, 0); + break; + } + /* an invalid op code */ + case AM: + case DM: + case OVERRIDE: + case PREFIX: + case UNKNOWN: + NOMEM; + default: + goto error; + } /* end switch */ + if (x->d86_error) + goto error; + +done: +#ifdef DIS_MEM + /* + * compute the size of any memory accessed by the instruction + */ + if (x->d86_memsize != 0) { + return (0); + } else if (dp->it_stackop) { + switch (opnd_size) { + case SIZE16: + x->d86_memsize = 2; + break; + case SIZE32: + x->d86_memsize = 4; + break; + case SIZE64: + x->d86_memsize = 8; + break; + } + } else if (nomem || mode == REG_ONLY) { + x->d86_memsize = 0; + + } else if (dp->it_size != 0) { + /* + * In 64 bit mode descriptor table entries + * go up to 10 bytes and popf/pushf are always 8 bytes + */ + if (x->d86_mode == SIZE64 && dp->it_size == 6) + x->d86_memsize = 10; + else if (x->d86_mode == SIZE64 && opcode1 == 0x9 && + (opcode2 == 0xc || opcode2 == 0xd)) + x->d86_memsize = 8; + else + x->d86_memsize = dp->it_size; + + } else if (wbit == 0) { + x->d86_memsize = 1; + + } else if (wbit == LONG_OPND) { + if (opnd_size == SIZE64) + x->d86_memsize = 8; + else if (opnd_size == SIZE32) + x->d86_memsize = 4; + else + x->d86_memsize = 2; + + } else if (wbit == SEG_OPND) { + x->d86_memsize = 4; + + } else { + x->d86_memsize = 8; + } +#endif + return (0); + +error: +#ifdef DIS_TEXT + (void) strlcat(x->d86_mnem, "undef", OPLEN); +#endif + return (1); +} + +#ifdef DIS_TEXT + +/* + * Some instructions should have immediate operands printed + * as unsigned integers. We compare against this table. + */ +static char *unsigned_ops[] = { + "or", "and", "xor", "test", "in", "out", "lcall", "ljmp", + "rcr", "rcl", "ror", "rol", "shl", "shr", "sal", "psr", "psl", + 0 +}; + + +static int +isunsigned_op(char *opcode) +{ + char *where; + int i; + int is_unsigned = 0; + + /* + * Work back to start of last mnemonic, since we may have + * prefixes on some opcodes. + */ + where = opcode + strlen(opcode) - 1; + while (where > opcode && *where != ' ') + --where; + if (*where == ' ') + ++where; + + for (i = 0; unsigned_ops[i]; ++i) { + if (strncmp(where, unsigned_ops[i], + strlen(unsigned_ops[i]))) + continue; + is_unsigned = 1; + break; + } + return (is_unsigned); +} + +/* + * Print a numeric immediate into end of buf, maximum length buflen. + * The immediate may be an address or a displacement. Mask is set + * for address size. If the immediate is a "small negative", or + * if it's a negative displacement of any magnitude, print as -. + * Respect the "octal" flag. "Small negative" is defined as "in the + * interval [NEG_LIMIT, 0)". + * + * Also, "isunsigned_op()" instructions never print negatives. + * + * Return whether we decided to print a negative value or not. 
+ */ + +#define NEG_LIMIT -255 +enum {IMM, DISP}; +enum {POS, TRY_NEG}; + +static int +print_imm(dis86_t *dis, uint64_t usv, uint64_t mask, char *buf, + size_t buflen, int disp, int try_neg) +{ + int curlen; + int64_t sv = (int64_t)usv; + int octal = dis->d86_flags & DIS_F_OCTAL; + + curlen = strlen(buf); + + if (try_neg == TRY_NEG && sv < 0 && + (disp || sv >= NEG_LIMIT) && + !isunsigned_op(dis->d86_mnem)) { + dis->d86_sprintf_func(buf + curlen, buflen - curlen, + octal ? "-0%llo" : "-0x%llx", (-sv) & mask); + return (1); + } else { + if (disp == DISP) + dis->d86_sprintf_func(buf + curlen, buflen - curlen, + octal ? "+0%llo" : "+0x%llx", usv & mask); + else + dis->d86_sprintf_func(buf + curlen, buflen - curlen, + octal ? "0%llo" : "0x%llx", usv & mask); + return (0); + + } +} + + +static int +log2(int size) +{ + switch (size) { + case 1: return (0); + case 2: return (1); + case 4: return (2); + case 8: return (3); + } + return (0); +} + +/* ARGSUSED */ +void +dtrace_disx86_str(dis86_t *dis, uint_t mode, uint64_t pc, char *buf, + size_t buflen) +{ + uint64_t reltgt = 0; + uint64_t tgt = 0; + int curlen; + int (*lookup)(void *, uint64_t, char *, size_t); + int i; + int64_t sv; + uint64_t usv, mask, save_mask, save_usv; + static uint64_t masks[] = + {0xffU, 0xffffU, 0xffffffffU, 0xffffffffffffffffULL}; + save_usv = 0; + + dis->d86_sprintf_func(buf, buflen, "%-6s ", dis->d86_mnem); + + /* + * For PC-relative jumps, the pc is really the next pc after executing + * this instruction, so increment it appropriately. + */ + pc += dis->d86_len; + + for (i = 0; i < dis->d86_numopnds; i++) { + d86opnd_t *op = &dis->d86_opnd[i]; + + if (i != 0) + (void) strlcat(buf, ",", buflen); + + (void) strlcat(buf, op->d86_prefix, buflen); + + /* + * sv is for the signed, possibly-truncated immediate or + * displacement; usv retains the original size and + * unsignedness for symbol lookup. + */ + + sv = usv = op->d86_value; + + /* + * About masks: for immediates that represent + * addresses, the appropriate display size is + * the effective address size of the instruction. + * This includes MODE_OFFSET, MODE_IPREL, and + * MODE_RIPREL. Immediates that are simply + * immediate values should display in the operand's + * size, however, since they don't represent addresses. + */ + + /* d86_addr_size is SIZEnn, which is log2(real size) */ + mask = masks[dis->d86_addr_size]; + + /* d86_value_size and d86_imm_bytes are in bytes */ + if (op->d86_mode == MODE_SIGNED || + op->d86_mode == MODE_IMPLIED) + mask = masks[log2(op->d86_value_size)]; + + switch (op->d86_mode) { + + case MODE_NONE: + + (void) strlcat(buf, op->d86_opnd, buflen); + break; + + case MODE_SIGNED: + case MODE_IMPLIED: + case MODE_OFFSET: + + tgt = usv; + + if (dis->d86_seg_prefix) + (void) strlcat(buf, dis->d86_seg_prefix, + buflen); + + if (op->d86_mode == MODE_SIGNED || + op->d86_mode == MODE_IMPLIED) { + (void) strlcat(buf, "$", buflen); + } + + if (print_imm(dis, usv, mask, buf, buflen, + IMM, TRY_NEG) && + (op->d86_mode == MODE_SIGNED || + op->d86_mode == MODE_IMPLIED)) { + + /* + * We printed a negative value for an + * immediate that wasn't a + * displacement. Note that fact so we can + * print the positive value as an + * annotation. 
+ */ + + save_usv = usv; + save_mask = mask; + } + (void) strlcat(buf, op->d86_opnd, buflen); + + break; + + case MODE_IPREL: + case MODE_RIPREL: + + reltgt = pc + sv; + + switch (mode) { + case SIZE16: + reltgt = (uint16_t)reltgt; + break; + case SIZE32: + reltgt = (uint32_t)reltgt; + break; + } + + (void) print_imm(dis, usv, mask, buf, buflen, + DISP, TRY_NEG); + + if (op->d86_mode == MODE_RIPREL) + (void) strlcat(buf, "(%rip)", buflen); + break; + } + } + + /* + * The symbol lookups may result in false positives, + * particularly on object files, where small numbers may match + * the 0-relative non-relocated addresses of symbols. + */ + + lookup = dis->d86_sym_lookup; + if (tgt != 0) { + if ((dis->d86_flags & DIS_F_NOIMMSYM) == 0 && + lookup(dis->d86_data, tgt, NULL, 0) == 0) { + (void) strlcat(buf, "\t<", buflen); + curlen = strlen(buf); + lookup(dis->d86_data, tgt, buf + curlen, + buflen - curlen); + (void) strlcat(buf, ">", buflen); + } + + /* + * If we printed a negative immediate above, print the + * positive in case our heuristic was unhelpful + */ + if (save_usv) { + (void) strlcat(buf, "\t<", buflen); + (void) print_imm(dis, save_usv, save_mask, buf, buflen, + IMM, POS); + (void) strlcat(buf, ">", buflen); + } + } + + if (reltgt != 0) { + /* Print symbol or effective address for reltgt */ + + (void) strlcat(buf, "\t<", buflen); + curlen = strlen(buf); + lookup(dis->d86_data, reltgt, buf + curlen, + buflen - curlen); + (void) strlcat(buf, ">", buflen); + } +} + +#endif /* DIS_TEXT */ Index: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h diff -N src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/dtrace/x86/dis_tables.h 20 Apr 2017 11:49:47 -0000 @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +/* + * $FreeBSD: head/sys/cddl/dev/dtrace/x86/dis_tables.h 313133 2017-02-03 03:22:47Z markj $ + */ + +#ifndef _DIS_TABLES_H +#define _DIS_TABLES_H + +/* + * Constants and prototypes for the IA32 disassembler backend. See dis_tables.c + * for usage information and documentation. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* + * values for cpu mode + */ +#define SIZE16 1 +#define SIZE32 2 +#define SIZE64 3 + +#define OPLEN 256 +#define PFIXLEN 8 +#define NCPS 20 /* number of chars per symbol */ + +/* + * data structures that must be provided to dtrace_dis86() + */ +typedef struct d86opnd { + char d86_opnd[OPLEN]; /* symbolic rep of operand */ + char d86_prefix[PFIXLEN]; /* any prefix string or "" */ + uint_t d86_mode; /* mode for immediate */ + uint_t d86_value_size; /* size in bytes of d86_value */ + uint64_t d86_value; /* immediate value of opnd */ +} d86opnd_t; + +typedef struct dis86 { + uint_t d86_mode; + uint_t d86_error; + uint_t d86_len; /* instruction length */ + int d86_rmindex; /* index of modrm byte or -1 */ + uint_t d86_memsize; /* size of memory referenced */ + char d86_bytes[16]; /* bytes of instruction */ + char d86_mnem[OPLEN]; + uint_t d86_numopnds; + uint_t d86_rex_prefix; /* value of REX prefix if !0 */ + char *d86_seg_prefix; /* segment prefix, if any */ + uint_t d86_opnd_size; + uint_t d86_addr_size; + uint_t d86_got_modrm; + uint_t d86_vsib; /* Has a VSIB */ + struct d86opnd d86_opnd[4]; /* up to 4 operands */ + int (*d86_check_func)(void *); + int (*d86_get_byte)(void *); +#ifdef DIS_TEXT + int (*d86_sym_lookup)(void *, uint64_t, char *, size_t); + int (*d86_sprintf_func)(char *, size_t, const char *, ...); + int d86_flags; + uint_t d86_imm_bytes; +#endif + void *d86_data; +} dis86_t; + +extern int dtrace_disx86(dis86_t *x, uint_t cpu_mode); + +#define DIS_F_OCTAL 0x1 /* Print all numbers in octal */ +#define DIS_F_NOIMMSYM 0x2 /* Don't print symbols for immediates (.o) */ + +#ifdef DIS_TEXT +extern void dtrace_disx86_str(dis86_t *x, uint_t cpu_mode, uint64_t pc, + char *buf, size_t len); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _DIS_TABLES_H */ Index: src/external/cddl/osnet/dev/dtrace/x86/instr_size.c =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/x86/instr_size.c diff -N src/external/cddl/osnet/dev/dtrace/x86/instr_size.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/dtrace/x86/instr_size.c 20 Apr 2017 11:58:22 -0000 @@ -0,0 +1,149 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD: head/sys/cddl/dev/dtrace/x86/instr_size.c 303050 2016-07-20 00:02:10Z markj $ + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + + +#ifdef illumos +#pragma ident "@(#)instr_size.c 1.14 05/07/08 SMI" +#endif + +#include +#include +#include +#ifdef illumos +#include +#include +#include +#include +#include +#endif +#ifdef __FreeBSD__ +#include +#include + +typedef u_int model_t; +#define DATAMODEL_NATIVE 0 +int dtrace_instr_size(uchar_t *); +int dtrace_instr_size_isa(uchar_t *, model_t, int *); +#endif +#ifdef __NetBSD__ +#include + +typedef u_int model_t; +#define DATAMODEL_NATIVE 0 +int dtrace_instr_size(uchar_t *); +int dtrace_instr_size_isa(uchar_t *, model_t, int *); +#endif + +#include + +/* + * This subsystem (with the minor exception of the instr_size() function) is + * is called from DTrace probe context. This imposes several requirements on + * the implementation: + * + * 1. External subsystems and functions may not be referenced. The one current + * exception is for cmn_err, but only to signal the detection of table + * errors. Assuming the tables are correct, no combination of input is to + * trigger a cmn_err call. + * + * 2. These functions can't be allowed to be traced. To prevent this, + * all functions in the probe path (everything except instr_size()) must + * have names that begin with "dtrace_". + */ + +typedef enum dis_isize { + DIS_ISIZE_INSTR, + DIS_ISIZE_OPERAND +} dis_isize_t; + + +/* + * get a byte from instruction stream + */ +static int +dtrace_dis_get_byte(void *p) +{ + int ret; + uchar_t **instr = p; + + ret = **instr; + *instr += 1; + + return (ret); +} + +/* + * Returns either the size of a given instruction, in bytes, or the size of that + * instruction's memory access (if any), depending on the value of `which'. + * If a programming error in the tables is detected, the system will panic to + * ease diagnosis. Invalid instructions will not be flagged. They will appear + * to have an instruction size between 1 and the actual size, and will be + * reported as having no memory impact. + */ +/* ARGSUSED2 */ +static int +dtrace_dis_isize(uchar_t *instr, dis_isize_t which, model_t model, int *rmindex) +{ + int sz; + dis86_t x; + uint_t mode = SIZE32; + + mode = (model == DATAMODEL_LP64) ? SIZE64 : SIZE32; + + x.d86_data = (void **)&instr; + x.d86_get_byte = dtrace_dis_get_byte; + x.d86_check_func = NULL; + + if (dtrace_disx86(&x, mode) != 0) + return (-1); + + if (which == DIS_ISIZE_INSTR) + sz = x.d86_len; /* length of the instruction */ + else + sz = x.d86_memsize; /* length of memory operand */ + + if (rmindex != NULL) + *rmindex = x.d86_rmindex; + return (sz); +} + +int +dtrace_instr_size_isa(uchar_t *instr, model_t model, int *rmindex) +{ + return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, model, rmindex)); +} + +int +dtrace_instr_size(uchar_t *instr) +{ + return (dtrace_dis_isize(instr, DIS_ISIZE_INSTR, DATAMODEL_NATIVE, + NULL)); +} Index: src/external/cddl/osnet/dev/dtrace/x86/regset.h =================================================================== RCS file: src/external/cddl/osnet/dev/dtrace/x86/regset.h diff -N src/external/cddl/osnet/dev/dtrace/x86/regset.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/dtrace/x86/regset.h 8 May 2017 08:17:51 -0000 @@ -0,0 +1,178 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD: head/sys/cddl/dev/dtrace/x86/regset.h 277300 2015-01-17 14:44:59Z smh $ + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +#ifndef _REGSET_H +#define _REGSET_H + +/* + * #pragma ident "@(#)regset.h 1.11 05/06/08 SMI" + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The names and offsets defined here should be specified by the + * AMD64 ABI suppl. + * + * We make fsbase and gsbase part of the lwp context (since they're + * the only way to access the full 64-bit address range via the segment + * registers) and thus belong here too. However we treat them as + * read-only; if %fs or %gs are updated, the results of the descriptor + * table lookup that those updates implicitly cause will be reflected + * in the corresponding fsbase and/or gsbase values the next time the + * context can be inspected. However it is NOT possible to override + * the fsbase/gsbase settings via this interface. + * + * Direct modification of the base registers (thus overriding the + * descriptor table base address) can be achieved with _lwp_setprivate. + */ + +#define REG_GSBASE 27 +#define REG_FSBASE 26 +#ifdef illumos +#define REG_DS 25 +#define REG_ES 24 + +#define REG_GS 23 +#define REG_FS 22 +#define REG_SS 21 +#define REG_RSP 20 +#define REG_RFL 19 +#define REG_CS 18 +#define REG_RIP 17 +#define REG_ERR 16 +#define REG_TRAPNO 15 +#define REG_RAX 14 +#define REG_RCX 13 +#define REG_RDX 12 +#define REG_RBX 11 +#define REG_RBP 10 +#define REG_RSI 9 +#define REG_RDI 8 +#define REG_R8 7 +#define REG_R9 6 +#define REG_R10 5 +#define REG_R11 4 +#define REG_R12 3 +#define REG_R13 2 +#define REG_R14 1 +#define REG_R15 0 +#else /* !illumos */ +#define REG_SS 25 +#define REG_RSP 24 +#define REG_RFL 23 +#define REG_CS 22 +#define REG_RIP 21 +#define REG_DS 20 +#define REG_ES 19 +#define REG_ERR 18 +#define REG_GS 17 +#define REG_FS 16 +#define REG_TRAPNO 15 +#define REG_RAX 14 +#define REG_RCX 13 +#define REG_RDX 12 +#define REG_RBX 11 +#define REG_RBP 10 +#define REG_RSI 9 +#define REG_RDI 8 +#define REG_R8 7 +#define REG_R9 6 +#define REG_R10 5 +#define REG_R11 4 +#define REG_R12 3 +#define REG_R13 2 +#define REG_R14 1 +#define REG_R15 0 +#endif /* illumos */ + +/* + * The names and offsets defined here are specified by i386 ABI suppl. 
+ */ + +#ifdef illumos +#define SS 18 /* only stored on a privilege transition */ +#define UESP 17 /* only stored on a privilege transition */ +#define EFL 16 +#define CS 15 +#define EIP 14 +#define ERR 13 +#define TRAPNO 12 +#define EAX 11 +#define ECX 10 +#define EDX 9 +#define EBX 8 +#define ESP 7 +#define EBP 6 +#define ESI 5 +#define EDI 4 +#define DS 3 +#define ES 2 +#define FS 1 +#define GS 0 +#else /* !illumos */ +#define GS 18 +#define SS 17 /* only stored on a privilege transition */ +#define UESP 16 /* only stored on a privilege transition */ +#define EFL 15 +#define CS 14 +#define EIP 13 +#define ERR 12 +#define TRAPNO 11 +#define EAX 10 +#define ECX 9 +#define EDX 8 +#define EBX 7 +#define ESP 6 +#define EBP 5 +#define ESI 4 +#define EDI 3 +#define DS 2 +#define ES 1 +#define FS 0 +#endif /* illumos */ + +#define REG_PC EIP +#define REG_FP EBP +#define REG_SP UESP +#define REG_PS EFL +#define REG_R0 EAX +#define REG_R1 EDX + +#ifdef __cplusplus +} +#endif + +#endif /* _REGSET_H */ Index: src/external/cddl/osnet/dev/fbt/fbt.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/fbt/fbt.c,v retrieving revision 1.22 diff -u -p -r1.22 fbt.c --- src/external/cddl/osnet/dev/fbt/fbt.c 27 Feb 2017 06:47:00 -0000 1.22 +++ src/external/cddl/osnet/dev/fbt/fbt.c 5 Jul 2017 00:25:12 -0000 @@ -23,7 +23,7 @@ * Portions Copyright 2006-2008 John Birrell jb@freebsd.org * Portions Copyright 2010 Darran Hunt darran@NetBSD.org * - * $FreeBSD: src/sys/cddl/dev/fbt/fbt.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/fbt/fbt.c 309786 2016-12-10 03:13:11Z markj $ * */ @@ -33,6 +33,7 @@ */ #include +#include #include #include #include @@ -57,21 +58,6 @@ #include #include -#include -#if defined(__i386__) || defined(__amd64__) -#include -#include -#if 0 -#include -#endif -#include -#elif __arm__ -#include -#include -#include -#include -#endif - #define ELFSIZE ARCH_ELFSIZE #include @@ -80,65 +66,19 @@ #include #include -mod_ctf_t *modptr; - -MALLOC_DEFINE(M_FBT, "fbt", "Function Boundary Tracing"); +#include "fbt.h" -#if defined(__i386__) || defined(__amd64__) -#define FBT_PUSHL_EBP 0x55 -#define FBT_MOVL_ESP_EBP0_V0 0x8b -#define FBT_MOVL_ESP_EBP1_V0 0xec -#define FBT_MOVL_ESP_EBP0_V1 0x89 -#define FBT_MOVL_ESP_EBP1_V1 0xe5 -#define FBT_REX_RSP_RBP 0x48 - -#define FBT_POPL_EBP 0x5d -#define FBT_RET 0xc3 -#define FBT_RET_IMM16 0xc2 -#define FBT_LEAVE 0xc9 -#endif - -#ifdef __amd64__ -#define FBT_PATCHVAL 0xcc -#elif defined(__i386__) -#define FBT_PATCHVAL 0xf0 - -#elif defined(__arm__) -#define FBT_PATCHVAL DTRACE_BREAKPOINT - -/* entry and return */ -#define FBT_BX_LR_P(insn) (((insn) & ~INSN_COND_MASK) == 0x012fff1e) -#define FBT_B_LABEL_P(insn) (((insn) & 0xff000000) == 0xea000000) -/* entry */ -#define FBT_MOV_IP_SP_P(insn) ((insn) == 0xe1a0c00d) -/* index=1, add=1, wback=0 */ -#define FBT_LDR_IMM_P(insn) (((insn) & 0xfff00000) == 0xe5900000) -#define FBT_MOVW_P(insn) (((insn) & 0xfff00000) == 0xe3000000) -#define FBT_MOV_IMM_P(insn) (((insn) & 0xffff0000) == 0xe3a00000) -#define FBT_CMP_IMM_P(insn) (((insn) & 0xfff00000) == 0xe3500000) -#define FBT_PUSH_P(insn) (((insn) & 0xffff0000) == 0xe92d0000) -/* return */ -/* cond=always, writeback=no, rn=sp and register_list includes pc */ -#define FBT_LDM_P(insn) (((insn) & 0x0fff8000) == 0x089d8000) -#define FBT_LDMIB_P(insn) (((insn) & 0x0fff8000) == 0x099d8000) -#define FBT_MOV_PC_LR_P(insn) (((insn) & ~INSN_COND_MASK) == 0x01a0f00e) -/* cond=always, 
writeback=no, rn=sp and register_list includes lr, but not pc */ -#define FBT_LDM_LR_P(insn) (((insn) & 0xffffc000) == 0xe89d4000) -#define FBT_LDMIB_LR_P(insn) (((insn) & 0xffffc000) == 0xe99d4000) - -/* rval = insn | invop_id (overwriting cond with invop ID) */ -#define BUILD_RVAL(insn, id) (((insn) & ~INSN_COND_MASK) | __SHIFTIN((id), INSN_COND_MASK)) -/* encode cond in the first byte */ -#define PATCHVAL_ENCODE_COND(insn) (FBT_PATCHVAL | __SHIFTOUT((insn), INSN_COND_MASK)) +mod_ctf_t *modptr; -#else -#error "architecture not supported" -#endif +dtrace_provider_id_t fbt_id; +fbt_probe_t **fbt_probetab; +int fbt_probetab_mask; +static int fbt_probetab_size; static dev_type_open(fbt_open); static int fbt_unload(void); static void fbt_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *); -static void fbt_provide_module(void *, dtrace_modctl_t *); +static void fbt_provide_module(void *, modctl_t *); static void fbt_destroy(void *, dtrace_id_t, void *); static int fbt_enable(void *, dtrace_id_t, void *); static void fbt_disable(void *, dtrace_id_t, void *); @@ -146,11 +86,6 @@ static void fbt_load(void); static void fbt_suspend(void *, dtrace_id_t, void *); static void fbt_resume(void *, dtrace_id_t, void *); -#define FBT_ENTRY "entry" -#define FBT_RETURN "return" -#define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask) -#define FBT_PROBETAB_SIZE 0x8000 /* 32k entries -- 128K total */ - static const struct cdevsw fbt_cdevsw = { .d_open = fbt_open, .d_close = noclose, @@ -187,259 +122,71 @@ static dtrace_pops_t fbt_pops = { fbt_destroy }; -typedef struct fbt_probe { - struct fbt_probe *fbtp_hashnext; -#if defined(__i386__) || defined(__amd64__) - uint8_t *fbtp_patchpoint; - int8_t fbtp_rval; - uint8_t fbtp_patchval; - uint8_t fbtp_savedval; -#elif __arm__ - uint32_t *fbtp_patchpoint; - int32_t fbtp_rval; - uint32_t fbtp_patchval; - uint32_t fbtp_savedval; -#endif - uintptr_t fbtp_roffset; - dtrace_id_t fbtp_id; - const char *fbtp_name; - dtrace_modctl_t *fbtp_ctl; - int fbtp_loadcnt; - int fbtp_primary; - int fbtp_invop_cnt; - int fbtp_symindx; - struct fbt_probe *fbtp_next; -} fbt_probe_t; - -#ifdef notyet -static struct cdev *fbt_cdev; +#ifdef __FreeBSD__ static int fbt_verbose = 0; -#endif -static dtrace_provider_id_t fbt_id; -static fbt_probe_t **fbt_probetab; -static int fbt_probetab_size; -static int fbt_probetab_mask; - -#ifdef __arm__ -extern void (* dtrace_emulation_jump_addr)(int, struct trapframe *); -static uint32_t -expand_imm(uint32_t imm12) -{ - uint32_t unrot = imm12 & 0xff; - int amount = 2 * (imm12 >> 8); +static struct cdev *fbt_cdev; +#endif /* __FreeBSD__ */ - if (amount) - return (unrot >> amount) | (unrot << (32 - amount)); - else - return unrot; -} +#ifdef __NetBSD__ +specificdata_key_t fbt_module_key; -static uint32_t -add_with_carry(uint32_t x, uint32_t y, int carry_in, - int *carry_out, int *overflow) -{ - uint32_t result; - uint64_t unsigned_sum = x + y + (uint32_t)carry_in; - int64_t signed_sum = (int32_t)x + (int32_t)y + (int32_t)carry_in; - KASSERT(carry_in == 1); - - result = (uint32_t)(unsigned_sum & 0xffffffff); - *carry_out = ((uint64_t)result == unsigned_sum) ? 1 : 0; - *overflow = ((int64_t)result == signed_sum) ? 
0 : 1; - - return result; -} +#define version xversion +#endif /* __NetBSD__ */ -static void -fbt_emulate(int _op, struct trapframe *frame) +int +fbt_excluded(const char *name) { - uint32_t op = _op; - switch (op >> 28) { - case DTRACE_INVOP_MOV_IP_SP: - /* mov ip, sp */ - frame->tf_ip = frame->tf_svc_sp; - frame->tf_pc += 4; - break; - case DTRACE_INVOP_BX_LR: - /* bx lr */ - frame->tf_pc = frame->tf_svc_lr; - break; - case DTRACE_INVOP_MOV_PC_LR: - /* mov pc, lr */ - frame->tf_pc = frame->tf_svc_lr; - break; - case DTRACE_INVOP_LDM: - /* ldm sp, {..., pc} */ - /* FALLTHRU */ - case DTRACE_INVOP_LDMIB: { - /* ldmib sp, {..., pc} */ - uint32_t register_list = (op & 0xffff); - uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp; - uint32_t *regs = &frame->tf_r0; - int i; - - /* IDMIB */ - if ((op >> 28) == 5) - sp++; - - for (i=0; i <= 12; i++) { - if (register_list & (1 << i)) - regs[i] = *sp++; - } - if (register_list & (1 << 13)) - frame->tf_svc_sp = *sp++; - if (register_list & (1 << 14)) - frame->tf_svc_lr = *sp++; - frame->tf_pc = *sp; - break; - } - case DTRACE_INVOP_LDR_IMM: { - /* ldr r?, [{pc,r?}, #?] */ - uint32_t rt = (op >> 12) & 0xf; - uint32_t rn = (op >> 16) & 0xf; - uint32_t imm = op & 0xfff; - uint32_t *regs = &frame->tf_r0; - KDASSERT(rt <= 12); - KDASSERT(rn == 15 || rn =< 12); - if (rn == 15) - regs[rt] = *((uint32_t *)(intptr_t)(frame->tf_pc + 8 + imm)); - else - regs[rt] = *((uint32_t *)(intptr_t)(regs[rn] + imm)); - frame->tf_pc += 4; - break; - } - case DTRACE_INVOP_MOVW: { - /* movw r?, #? */ - uint32_t rd = (op >> 12) & 0xf; - uint32_t imm = (op & 0xfff) | ((op & 0xf0000) >> 4); - uint32_t *regs = &frame->tf_r0; - KDASSERT(rd <= 12); - regs[rd] = imm; - frame->tf_pc += 4; - break; - } - case DTRACE_INVOP_MOV_IMM: { - /* mov r?, #? */ - uint32_t rd = (op >> 12) & 0xf; - uint32_t imm = expand_imm(op & 0xfff); - uint32_t *regs = &frame->tf_r0; - KDASSERT(rd <= 12); - regs[rd] = imm; - frame->tf_pc += 4; - break; - } - case DTRACE_INVOP_CMP_IMM: { - /* cmp r?, #? */ - uint32_t rn = (op >> 16) & 0xf; - uint32_t *regs = &frame->tf_r0; - uint32_t imm = expand_imm(op & 0xfff); - uint32_t spsr = frame->tf_spsr; - uint32_t result; - int carry; - int overflow; + if (strncmp(name, "dtrace_", 7) == 0 && + strncmp(name, "dtrace_safe_", 12) != 0) { /* - * (result, carry, overflow) = AddWithCarry(R[n], NOT(imm32), ’1’); - * APSR.N = result<31>; - * APSR.Z = IsZeroBit(result); - * APSR.C = carry; - * APSR.V = overflow; + * Anything beginning with "dtrace_" may be called + * from probe context unless it explicitly indicates + * that it won't be called from probe context by + * using the prefix "dtrace_safe_". */ - KDASSERT(rn <= 12); - result = add_with_carry(regs[rn], ~imm, 1, &carry, &overflow); - if (result & 0x80000000) - spsr |= PSR_N_bit; - else - spsr &= ~PSR_N_bit; - if (result == 0) - spsr |= PSR_Z_bit; - else - spsr &= ~PSR_Z_bit; - if (carry) - spsr |= PSR_C_bit; - else - spsr &= ~PSR_C_bit; - if (overflow) - spsr |= PSR_V_bit; - else - spsr &= ~PSR_V_bit; - -#if 0 - aprint_normal("pc=%x Rn=%x imm=%x %c%c%c%c\n", frame->tf_pc, regs[rn], imm, - (spsr & PSR_N_bit) ? 'N' : 'n', - (spsr & PSR_Z_bit) ? 'Z' : 'z', - (spsr & PSR_C_bit) ? 'C' : 'c', - (spsr & PSR_V_bit) ? 'V' : 'v'); -#endif - frame->tf_spsr = spsr; - frame->tf_pc += 4; - break; - } - case DTRACE_INVOP_B_LABEL: { - /* b ??? 
*/ - uint32_t imm = (op & 0x00ffffff) << 2; - int32_t diff; - /* SignExtend(imm26, 32) */ - if (imm & 0x02000000) - imm |= 0xfc000000; - diff = (int32_t)imm; - frame->tf_pc += 8 + diff; - break; + return (1); } - /* FIXME: push will overwrite trapframe... */ - case DTRACE_INVOP_PUSH: { - /* push {...} */ - uint32_t register_list = (op & 0xffff); - uint32_t *sp = (uint32_t *)(intptr_t)frame->tf_svc_sp; - uint32_t *regs = &frame->tf_r0; - int i; - int count = 0; - -#if 0 - if ((op & 0x0fff0fff) == 0x052d0004) { - /* A2: str r4, [sp, #-4]! */ - *(sp - 1) = regs[4]; - frame->tf_pc += 4; - break; - } -#endif - for (i=0; i < 16; i++) { - if (register_list & (1 << i)) - count++; - } - sp -= count; +#ifdef __FreeBSD__ + /* + * Lock owner methods may be called from probe context. + */ + if (strcmp(name, "owner_mtx") == 0 || + strcmp(name, "owner_rm") == 0 || + strcmp(name, "owner_rw") == 0 || + strcmp(name, "owner_sx") == 0) + return (1); - for (i=0; i <= 12; i++) { - if (register_list & (1 << i)) - *sp++ = regs[i]; - } - if (register_list & (1 << 13)) - *sp++ = frame->tf_svc_sp; - if (register_list & (1 << 14)) - *sp++ = frame->tf_svc_lr; - if (register_list & (1 << 15)) - *sp = frame->tf_pc + 8; - - /* make sure the caches and memory are in sync */ - cpu_dcache_wbinv_range(frame->tf_svc_sp, count * 4); - - /* In case the current page tables have been modified ... */ - cpu_tlb_flushID(); - cpu_cpwait(); + /* + * When DTrace is built into the kernel we need to exclude + * the FBT functions from instrumentation. + */ +#ifndef _KLD_MODULE + if (strncmp(name, "fbt_", 4) == 0) + return (1); +#endif +#endif - frame->tf_svc_sp -= count * 4; - frame->tf_pc += 4; +#ifdef __NetBSD__ + if (name[0] == '_' && name[1] == '_') + return (1); - break; - } - default: - KDASSERTMSG(0, "op=%u\n", op >> 28); + if (strcmp(name, "cpu_index") == 0 || + strncmp(name, "db_", 3) == 0 || + strncmp(name, "ddb_", 4) == 0 || + strncmp(name, "kdb_", 4) == 0 || + strncmp(name, "lockdebug_", 10) == 0 || + strncmp(name, "kauth_", 5) == 0 || + strncmp(name, "ktext_write", 11) == 0) { + return (1); } -} #endif + return (0); +} + static void fbt_doubletrap(void) { @@ -450,489 +197,70 @@ fbt_doubletrap(void) fbt = fbt_probetab[i]; for (; fbt != NULL; fbt = fbt->fbtp_next) - *fbt->fbtp_patchpoint = fbt->fbtp_savedval; - } -} - - -static int -fbt_invop(uintptr_t addr, struct trapframe *frame, uintptr_t rval) -{ - solaris_cpu_t *cpu; - uintptr_t *stack; - uintptr_t arg0, arg1, arg2, arg3, arg4; - fbt_probe_t *fbt; - -#ifdef __amd64__ - stack = (uintptr_t *)frame->tf_rsp; -#endif -#ifdef __i386__ - /* Skip hardware-saved registers. */ - stack = (uintptr_t *)&frame->tf_esp; -#endif -#ifdef __arm__ - stack = (uintptr_t *)frame->tf_svc_sp; -#endif - - cpu = &solaris_cpu[cpu_number()]; - fbt = fbt_probetab[FBT_ADDR2NDX(addr)]; - for (; fbt != NULL; fbt = fbt->fbtp_hashnext) { - if ((uintptr_t)fbt->fbtp_patchpoint == addr) { - fbt->fbtp_invop_cnt++; - if (fbt->fbtp_roffset == 0) { -#ifdef __amd64__ - /* fbt->fbtp_rval == DTRACE_INVOP_PUSHQ_RBP */ - DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - cpu->cpu_dtrace_caller = stack[0]; - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | - CPU_DTRACE_BADADDR); - - arg0 = frame->tf_rdi; - arg1 = frame->tf_rsi; - arg2 = frame->tf_rdx; - arg3 = frame->tf_rcx; - arg4 = frame->tf_r8; -#else - int i = 0; - - /* - * When accessing the arguments on the stack, - * we must protect against accessing beyond - * the stack. We can safely set NOFAULT here - * -- we know that interrupts are already - * disabled. 
- */ - DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - cpu->cpu_dtrace_caller = stack[i++]; - arg0 = stack[i++]; - arg1 = stack[i++]; - arg2 = stack[i++]; - arg3 = stack[i++]; - arg4 = stack[i++]; - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | - CPU_DTRACE_BADADDR); -#endif - - dtrace_probe(fbt->fbtp_id, arg0, arg1, - arg2, arg3, arg4); - - cpu->cpu_dtrace_caller = 0; - } else { -#ifdef __amd64__ - /* - * On amd64, we instrument the ret, not the - * leave. We therefore need to set the caller - * to ensure that the top frame of a stack() - * action is correct. - */ - DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - cpu->cpu_dtrace_caller = stack[0]; - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | - CPU_DTRACE_BADADDR); -#endif - - dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset, - rval, 0, 0, 0); - cpu->cpu_dtrace_caller = 0; - } - - return (fbt->fbtp_rval); - } + fbt_patch_tracepoint(fbt, fbt->fbtp_savedval); } - - return (0); } -#if defined(__i386__) || defined(__amd64__) -static int -fbt_provide_module_cb(const char *name, int symindx, void *value, - uint32_t symsize, int type, void *opaque) +#ifdef __FreeBSD__ +static void +fbt_provide_module(void *arg, modctl_t *lf) { - fbt_probe_t *fbt, *retfbt; - u_int8_t *instr, *limit; - dtrace_modctl_t *mod = opaque; - const char *modname = mod->mod_info->mi_name; - int j; - int size; - - /* got a function? */ - if (ELF_ST_TYPE(type) != STT_FUNC) { - return 0; - } - - if (strncmp(name, "dtrace_", 7) == 0 && - strncmp(name, "dtrace_safe_", 12) != 0) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explicitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - return (0); - } + char modname[MAXPATHLEN]; + int i; + size_t len; - if (name[0] == '_' && name[1] == '_') - return (0); + strlcpy(modname, lf->filename, sizeof(modname)); + len = strlen(modname); + if (len > 3 && strcmp(modname + len - 3, ".ko") == 0) + modname[len - 3] = '\0'; /* - * Exclude some more symbols which can be called from probe context. + * Employees of dtrace and their families are ineligible. Void + * where prohibited. */ - if (strcmp(name, "x86_curcpu") == 0 /* CPU */ - || strcmp(name, "x86_curlwp") == 0 /* curproc, curlwp, curthread */ - || strcmp(name, "cpu_index") == 0 /* cpu_number, curcpu_id */ - || strncmp(name, "db_", 3) == 0 /* debugger */ - || strncmp(name, "ddb_", 4) == 0 /* debugger */ - || strncmp(name, "kdb_", 4) == 0 /* debugger */ - || strncmp(name, "lockdebug_", 10) == 0 /* lockdebug XXX for now */ - || strncmp(name, "kauth_", 5) == 0 /* CRED XXX for now */ - ) { - return 0; - } - - instr = (u_int8_t *) value; - limit = (u_int8_t *) value + symsize; - -#ifdef __amd64__ - while (instr < limit) { - if (*instr == FBT_PUSHL_EBP) - break; - - if ((size = dtrace_instr_size(instr)) <= 0) - break; - - instr += size; - } - - if (instr >= limit || *instr != FBT_PUSHL_EBP) { - /* - * We either don't save the frame pointer in this - * function, or we ran into some disassembly - * screw-up. Either way, we bail. 
- */ - return (0); - } -#else - if (instr[0] != FBT_PUSHL_EBP) { - return (0); - } - - if (!(instr[1] == FBT_MOVL_ESP_EBP0_V0 && - instr[2] == FBT_MOVL_ESP_EBP1_V0) && - !(instr[1] == FBT_MOVL_ESP_EBP0_V1 && - instr[2] == FBT_MOVL_ESP_EBP1_V1)) { - return (0); - } -#endif - fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO); - fbt->fbtp_name = name; - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_ENTRY, 3, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = mod; - /* fbt->fbtp_loadcnt = lf->loadcnt; */ - fbt->fbtp_rval = DTRACE_INVOP_PUSHL_EBP; - fbt->fbtp_savedval = *instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_symindx = symindx; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - mod->mod_fbtentries++; - - retfbt = NULL; - - while (instr < limit) { - if (instr >= limit) - return (0); - - /* - * If this disassembly fails, then we've likely walked off into - * a jump table or some other unsuitable area. Bail out of the - * disassembly now. - */ - if ((size = dtrace_instr_size(instr)) <= 0) - return (0); - -#ifdef __amd64__ - /* - * We only instrument "ret" on amd64 -- we don't yet instrument - * ret imm16, largely because the compiler doesn't seem to - * (yet) emit them in the kernel... - */ - if (*instr != FBT_RET) { - instr += size; - continue; - } -#else - if (!(size == 1 && - (*instr == FBT_POPL_EBP || *instr == FBT_LEAVE) && - (*(instr + 1) == FBT_RET || - *(instr + 1) == FBT_RET_IMM16))) { - instr += size; - continue; - } -#endif - - /* - * We (desperately) want to avoid erroneously instrumenting a - * jump table, especially given that our markers are pretty - * short: two bytes on x86, and just one byte on amd64. To - * determine if we're looking at a true instruction sequence - * or an inline jump table that happens to contain the same - * byte sequences, we resort to some heuristic sleeze: we - * treat this instruction as being contained within a pointer, - * and see if that pointer points to within the body of the - * function. If it does, we refuse to instrument it. - */ - for (j = 0; j < sizeof (uintptr_t); j++) { - caddr_t check = (caddr_t) instr - j; - uint8_t *ptr; - - if (check < (caddr_t)value) - break; - - if (check + sizeof (caddr_t) > (caddr_t)limit) - continue; - - ptr = *(uint8_t **)check; - - if (ptr >= (uint8_t *) value && ptr < limit) { - instr += size; - continue; - } - } - - /* - * We have a winner! 
- */ - fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO); - fbt->fbtp_name = name; - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, 3, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = mod; - /* fbt->fbtp_loadcnt = lf->loadcnt; */ - fbt->fbtp_symindx = symindx; - -#ifndef __amd64__ - if (*instr == FBT_POPL_EBP) { - fbt->fbtp_rval = DTRACE_INVOP_POPL_EBP; - } else { - ASSERT(*instr == FBT_LEAVE); - fbt->fbtp_rval = DTRACE_INVOP_LEAVE; - } - fbt->fbtp_roffset = - (uintptr_t)(instr - (uint8_t *) value) + 1; - -#else - ASSERT(*instr == FBT_RET); - fbt->fbtp_rval = DTRACE_INVOP_RET; - fbt->fbtp_roffset = - (uintptr_t)(instr - (uint8_t *) value); -#endif - - fbt->fbtp_savedval = *instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - mod->mod_fbtentries++; - - instr += size; - } - - return 0; -} - -#elif defined(__arm__) - -static int -fbt_provide_module_cb(const char *name, int symindx, void *value, - uint32_t symsize, int type, void *opaque) -{ - fbt_probe_t *fbt, *retfbt; - uint32_t *instr, *limit; - bool was_ldm_lr = false; - dtrace_modctl_t *mod = opaque; - const char *modname = mod->mod_info->mi_name; - int size; - - /* got a function? */ - if (ELF_ST_TYPE(type) != STT_FUNC) { - return 0; - } - - if (strncmp(name, "dtrace_", 7) == 0 && - strncmp(name, "dtrace_safe_", 12) != 0) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explicitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - return (0); - } - - if (name[0] == '_' && name[1] == '_') - return (0); + if (strcmp(modname, "dtrace") == 0) + return; /* - * Exclude some more symbols which can be called from probe context. + * To register with DTrace, a module must list 'dtrace' as a + * dependency in order for the kernel linker to resolve + * symbols like dtrace_register(). All modules with such a + * dependency are ineligible for FBT tracing. 
*/ - if (strncmp(name, "db_", 3) == 0 /* debugger */ - || strncmp(name, "ddb_", 4) == 0 /* debugger */ - || strncmp(name, "kdb_", 4) == 0 /* debugger */ - || strncmp(name, "lockdebug_", 10) == 0 /* lockdebug XXX for now */ - || strncmp(name, "kauth_", 5) == 0 /* CRED XXX for now */ - /* Sensitive functions on ARM */ - || strncmp(name, "_spl", 4) == 0 - || strcmp(name, "binuptime") == 0 - || strcmp(name, "dosoftints") == 0 - || strcmp(name, "fbt_emulate") == 0 - || strcmp(name, "nanouptime") == 0 - || strcmp(name, "undefinedinstruction") == 0 - || strncmp(name, "dmt_", 4) == 0 /* omap */ - || strncmp(name, "mvsoctmr_", 9) == 0 /* marvell */ - ) { - return 0; - } - - instr = (uint32_t *) value; - limit = (uint32_t *)((uintptr_t)value + symsize); - - if (!FBT_MOV_IP_SP_P(*instr) - && !FBT_BX_LR_P(*instr) - && !FBT_MOVW_P(*instr) - && !FBT_MOV_IMM_P(*instr) - && !FBT_B_LABEL_P(*instr) - && !FBT_LDR_IMM_P(*instr) - && !FBT_CMP_IMM_P(*instr) - /* && !FBT_PUSH_P(*instr) */ - ) { - return 0; - } - - fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO); - fbt->fbtp_name = name; - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_ENTRY, 3, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = mod; - /* fbt->fbtp_loadcnt = lf->loadcnt; */ - if (FBT_MOV_IP_SP_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IP_SP); - else if (FBT_LDR_IMM_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDR_IMM); - else if (FBT_MOVW_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOVW); - else if (FBT_MOV_IMM_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IMM); - else if (FBT_CMP_IMM_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_CMP_IMM); - else if (FBT_BX_LR_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR); - else if (FBT_PUSH_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_PUSH); - else if (FBT_B_LABEL_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B_LABEL); - - fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr); - fbt->fbtp_savedval = *instr; - fbt->fbtp_symindx = symindx; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - mod->mod_fbtentries++; - - retfbt = NULL; - - while (instr < limit) { - if (instr >= limit) - return (0); - - size = 1; - - if (!FBT_BX_LR_P(*instr) - && !FBT_MOV_PC_LR_P(*instr) - && !FBT_LDM_P(*instr) - && !FBT_LDMIB_P(*instr) - && !(was_ldm_lr && FBT_B_LABEL_P(*instr)) - ) { - if (FBT_LDM_LR_P(*instr) || FBT_LDMIB_LR_P(*instr)) - was_ldm_lr = true; - else - was_ldm_lr = false; - instr += size; - continue; - } + for (i = 0; i < lf->ndeps; i++) + if (strncmp(lf->deps[i]->filename, "dtrace", 6) == 0) + return; + if (lf->fbt_nentries) { /* - * We have a winner! + * This module has some FBT entries allocated; we're afraid + * to screw with it. 
*/ - fbt = malloc(sizeof (fbt_probe_t), M_FBT, M_WAITOK | M_ZERO); - fbt->fbtp_name = name; - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, 3, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = mod; - /* fbt->fbtp_loadcnt = lf->loadcnt; */ - fbt->fbtp_symindx = symindx; - - if (FBT_BX_LR_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR); - else if (FBT_MOV_PC_LR_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_PC_LR); - else if (FBT_LDM_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDM); - else if (FBT_LDMIB_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDMIB); - else if (FBT_B_LABEL_P(*instr)) - fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B_LABEL); - - fbt->fbtp_roffset = (uintptr_t)(instr - (uint32_t *) value); - fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr); - - fbt->fbtp_savedval = *instr; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - mod->mod_fbtentries++; - - instr += size; - was_ldm_lr = false; + return; } - return 0; + /* + * List the functions in the module and the symbol values. + */ + (void) linker_file_function_listall(lf, fbt_provide_module_function, modname); } -#else -#error "architecture not supported" #endif - - +#ifdef __NetBSD__ static void -fbt_provide_module(void *arg, dtrace_modctl_t *mod) +fbt_provide_module(void *arg, modctl_t *mod) { + struct fbt_ksyms_arg fka; + struct mod_ctf *mc; char modname[MAXPATHLEN]; int i; size_t len; - strlcpy(modname, mod->mod_info->mi_name, sizeof(modname)); + if (mod_ctf_get(mod, &mc)) { + printf("fbt: no CTF data for module %s\n", module_name(mod)); + return; + } + + strlcpy(modname, module_name(mod), sizeof(modname)); len = strlen(modname); if (len > 5 && strcmp(modname + len - 3, ".kmod") == 0) modname[len - 4] = '\0'; @@ -958,36 +286,52 @@ fbt_provide_module(void *arg, dtrace_mod * dependency are ineligible for FBT tracing. */ for (i = 0; i < mod->mod_nrequired; i++) { - if (strncmp(mod->mod_required[i]->mod_info->mi_name, + if (strncmp(module_name(mod->mod_required[i]), "dtrace", 6) == 0) return; } - - if (mod->mod_fbtentries) { - /* - * This module has some FBT entries allocated; we're afraid - * to screw with it. - */ + if (mc->fbt_provided) { return; } /* * List the functions in the module and the symbol values. */ - ksyms_mod_foreach(modname, fbt_provide_module_cb, mod); + memset(&fka, 0, sizeof(fka)); + fka.fka_mod = mod; + fka.fka_mc = mc; + ksyms_mod_foreach(modname, fbt_provide_module_cb, &fka); + mc->fbt_provided = true; +} + +static void +fbt_module_dtor(void *arg) +{ + mod_ctf_t *mc = arg; + + if (mc->ctfalloc) + free(mc->ctftab, M_TEMP); + kmem_free(mc, sizeof(*mc)); } +#endif static void fbt_destroy(void *arg, dtrace_id_t id, void *parg) { fbt_probe_t *fbt = parg, *next, *hash, *last; - dtrace_modctl_t *ctl; + modctl_t *ctl; int ndx; do { ctl = fbt->fbtp_ctl; +#ifdef __FreeBSD__ ctl->mod_fbtentries--; +#endif +#ifdef __NetBSD__ + mod_ctf_t *mc = module_getspecific(ctl, fbt_module_key); + mc->fbt_provided = false; +#endif /* * Now we need to remove this probe from the fbt_probetab. 
@@ -1009,26 +353,21 @@ fbt_destroy(void *arg, dtrace_id_t id, v } next = fbt->fbtp_next; - free(fbt, M_FBT); + kmem_free(fbt, sizeof(*fbt)); fbt = next; } while (fbt != NULL); } -#if defined(__i386__) || defined(__amd64__) - static int fbt_enable(void *arg, dtrace_id_t id, void *parg) { fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - u_long psl; - u_long cr0; + modctl_t *ctl = fbt->fbtp_ctl; - -#if 0 /* XXX TBD */ +#ifdef __NetBSD__ + module_hold(ctl); +#else ctl->nenabled++; /* @@ -1040,33 +379,15 @@ fbt_enable(void *arg, dtrace_id_t id, vo if (fbt_verbose) { printf("fbt is failing for probe %s " "(module %s reloaded)", - fbt->fbtp_name, ctl->filename); + fbt->fbtp_name, module_name(ctl)); } - return; + return 0; } #endif - /* Disable interrupts. */ - psl = x86_read_psl(); - x86_disable_intr(); - - /* Disable write protection in supervisor mode. */ - cr0 = rcr0(); - lcr0(cr0 & ~CR0_WP); - - for (; fbt != NULL; fbt = fbt->fbtp_next) { - *fbt->fbtp_patchpoint = fbt->fbtp_patchval; - } - - /* Write back and invalidate cache, flush pipelines. */ - wbinvd(); - x86_flush(); - x86_write_psl(psl); - - /* Re-enable write protection. */ - lcr0(cr0); - + for (; fbt != NULL; fbt = fbt->fbtp_next) + fbt_patch_tracepoint(fbt, fbt->fbtp_patchval); return 0; } @@ -1074,240 +395,60 @@ static void fbt_disable(void *arg, dtrace_id_t id, void *parg) { fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - u_long psl; - u_long cr0; + modctl_t *ctl = fbt->fbtp_ctl; -#if 0 /* XXX TBD */ +#ifndef __NetBSD__ ASSERT(ctl->nenabled > 0); ctl->nenabled--; if ((ctl->loadcnt != fbt->fbtp_loadcnt)) return; #endif - /* Disable interrupts. */ - psl = x86_read_psl(); - x86_disable_intr(); - - /* Disable write protection in supervisor mode. */ - cr0 = rcr0(); - lcr0(cr0 & ~CR0_WP); for (; fbt != NULL; fbt = fbt->fbtp_next) - *fbt->fbtp_patchpoint = fbt->fbtp_savedval; + fbt_patch_tracepoint(fbt, fbt->fbtp_savedval); - /* Write back and invalidate cache, flush pipelines. */ - wbinvd(); - x86_flush(); - x86_write_psl(psl); - - /* Re-enable write protection. */ - lcr0(cr0); +#ifdef __NetBSD__ + module_rele(ctl); +#endif } static void fbt_suspend(void *arg, dtrace_id_t id, void *parg) { fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - u_long psl; - u_long cr0; +#ifndef __NetBSD__ + modctl_t *ctl = fbt->fbtp_ctl; -#if 0 /* XXX TBD */ ASSERT(ctl->nenabled > 0); if ((ctl->loadcnt != fbt->fbtp_loadcnt)) return; #endif - /* Disable interrupts. */ - psl = x86_read_psl(); - x86_disable_intr(); - - /* Disable write protection in supervisor mode. */ - cr0 = rcr0(); - lcr0(cr0 & ~CR0_WP); - for (; fbt != NULL; fbt = fbt->fbtp_next) - *fbt->fbtp_patchpoint = fbt->fbtp_savedval; - - /* Write back and invalidate cache, flush pipelines. */ - wbinvd(); - x86_flush(); - x86_write_psl(psl); - - /* Re-enable write protection. */ - lcr0(cr0); + fbt_patch_tracepoint(fbt, fbt->fbtp_savedval); } static void fbt_resume(void *arg, dtrace_id_t id, void *parg) { fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - u_long psl; - u_long cr0; +#ifndef __NetBSD__ + modctl_t *ctl = fbt->fbtp_ctl; -#if 0 /* XXX TBD */ ASSERT(ctl->nenabled > 0); if ((ctl->loadcnt != fbt->fbtp_loadcnt)) return; #endif - /* Disable interrupts. */ - psl = x86_read_psl(); - x86_disable_intr(); - - /* Disable write protection in supervisor mode. 
*/ - cr0 = rcr0(); - lcr0(cr0 & ~CR0_WP); for (; fbt != NULL; fbt = fbt->fbtp_next) - *fbt->fbtp_patchpoint = fbt->fbtp_patchval; - - /* Write back and invalidate cache, flush pipelines. */ - wbinvd(); - x86_flush(); - x86_write_psl(psl); - - /* Re-enable write protection. */ - lcr0(cr0); + fbt_patch_tracepoint(fbt, fbt->fbtp_patchval); } -#elif defined(__arm__) - static int -fbt_enable(void *arg, dtrace_id_t id, void *parg) -{ - fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - dtrace_icookie_t c; - - -#if 0 /* XXX TBD */ - ctl->nenabled++; - - /* - * Now check that our modctl has the expected load count. If it - * doesn't, this module must have been unloaded and reloaded -- and - * we're not going to touch it. - */ - if (ctl->loadcnt != fbt->fbtp_loadcnt) { - if (fbt_verbose) { - printf("fbt is failing for probe %s " - "(module %s reloaded)", - fbt->fbtp_name, ctl->filename); - } - - return; - } -#endif - - c = dtrace_interrupt_disable(); - - for (fbt = parg; fbt != NULL; fbt = fbt->fbtp_next) { - *fbt->fbtp_patchpoint = fbt->fbtp_patchval; - cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4); - } - - dtrace_interrupt_enable(c); - - return 0; -} - -static void -fbt_disable(void *arg, dtrace_id_t id, void *parg) -{ - fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - dtrace_icookie_t c; - -#if 0 /* XXX TBD */ - ASSERT(ctl->nenabled > 0); - ctl->nenabled--; - - if ((ctl->loadcnt != fbt->fbtp_loadcnt)) - return; -#endif - - c = dtrace_interrupt_disable(); - - for (; fbt != NULL; fbt = fbt->fbtp_next) { - *fbt->fbtp_patchpoint = fbt->fbtp_savedval; - cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4); - } - - dtrace_interrupt_enable(c); -} - -static void -fbt_suspend(void *arg, dtrace_id_t id, void *parg) -{ - fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - dtrace_icookie_t c; - -#if 0 /* XXX TBD */ - ASSERT(ctl->nenabled > 0); - - if ((ctl->loadcnt != fbt->fbtp_loadcnt)) - return; -#endif - - c = dtrace_interrupt_disable(); - - for (; fbt != NULL; fbt = fbt->fbtp_next) { - *fbt->fbtp_patchpoint = fbt->fbtp_savedval; - cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4); - } - - dtrace_interrupt_enable(c); -} - -static void -fbt_resume(void *arg, dtrace_id_t id, void *parg) -{ - fbt_probe_t *fbt = parg; -#if 0 - dtrace_modctl_t *ctl = fbt->fbtp_ctl; -#endif - dtrace_icookie_t c; - -#if 0 /* XXX TBD */ - ASSERT(ctl->nenabled > 0); - - if ((ctl->loadcnt != fbt->fbtp_loadcnt)) - return; -#endif - - c = dtrace_interrupt_disable(); - - for (; fbt != NULL; fbt = fbt->fbtp_next) { - *fbt->fbtp_patchpoint = fbt->fbtp_patchval; - cpu_idcache_wbinv_range((vaddr_t)fbt->fbtp_patchpoint, 4); - } - - dtrace_interrupt_enable(c); -} - -#else -#error "architecture not supported" -#endif - -static int -fbt_ctfoff_init(dtrace_modctl_t *mod, mod_ctf_t *mc) +fbt_ctfoff_init(modctl_t *mod, mod_ctf_t *mc) { const Elf_Sym *symp = mc->symtab; const ctf_header_t *hp = (const ctf_header_t *) mc->ctftab; @@ -1323,19 +464,16 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo /* Sanity check. 
*/ if (hp->cth_magic != CTF_MAGIC) { printf("Bad magic value in CTF data of '%s'\n", - mod->mod_info->mi_name); + module_name(mod)); return (EINVAL); } if (mc->symtab == NULL) { - printf("No symbol table in '%s'\n", - mod->mod_info->mi_name); + printf("No symbol table in '%s'\n", module_name(mod)); return (EINVAL); } - if ((ctfoff = malloc(sizeof(uint32_t) * nsyms, M_FBT, M_WAITOK)) == NULL) - return (ENOMEM); - + ctfoff = malloc(sizeof(uint32_t) * nsyms, M_FBT, M_WAITOK); mc->ctfoffp = ctfoff; for (i = 0; i < nsyms; i++, ctfoff++, symp++) { @@ -1346,9 +484,10 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo continue; } - /* CTF expects the pre-sorted symbol ordering, + /* + * CTF expects the unsorted symbol ordering, * so map it from that to the current sorted - * and trimmed symbol table. + * symbol table. * ctfoff[new-ind] = oldind symbol info. */ @@ -1359,6 +498,12 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo ctfoff = &mc->ctfoffp[mc->nmap[i]-1]; } + /* + * Note that due to how kern_ksyms.c adjusts st_name + * to be the offset into a virtual combined strtab, + * st_name will never be 0 for loaded modules. + */ + if (symp->st_name == 0 || symp->st_shndx == SHN_UNDEF) { *ctfoff = 0xffffffff; continue; @@ -1408,12 +553,12 @@ fbt_ctfoff_init(dtrace_modctl_t *mod, mo } static ssize_t -fbt_get_ctt_size(uint8_t xversion, const ctf_type_t *tp, ssize_t *sizep, +fbt_get_ctt_size(uint8_t version, const ctf_type_t *tp, ssize_t *sizep, ssize_t *incrementp) { ssize_t size, increment; - if (xversion > CTF_VERSION_1 && + if (version > CTF_VERSION_1 && tp->ctt_size == CTF_LSIZE_SENT) { size = CTF_TYPE_LSIZE(tp); increment = sizeof (ctf_type_t); @@ -1442,6 +587,7 @@ fbt_typoff_init(mod_ctf_t *mc) uint32_t *xp; ulong_t pop[CTF_K_MAX + 1] = { 0 }; + /* Sanity check. */ if (hp->cth_magic != CTF_MAGIC) return (EINVAL); @@ -1522,17 +668,19 @@ fbt_typoff_init(mod_ctf_t *mc) vbytes = 0; break; default: - printf("%s(%d): detected invalid CTF kind -- %u\n", __func__, __LINE__, kind); + printf("%s(%d): detected invalid CTF kind -- %u\n", + __func__, __LINE__, kind); return (EIO); } tp = (ctf_type_t *)((uintptr_t)tp + increment + vbytes); pop[kind]++; } + /* account for a sentinel value below */ + ctf_typemax++; mc->typlen = ctf_typemax; - if ((xp = malloc(sizeof(uint32_t) * ctf_typemax, M_FBT, M_ZERO | M_WAITOK)) == NULL) - return (ENOMEM); + xp = malloc(sizeof(uint32_t) * ctf_typemax, M_FBT, M_ZERO | M_WAITOK); mc->typoffp = xp; @@ -1854,11 +1002,7 @@ ctf_decl_push(ctf_decl_t *cd, mod_ctf_t prec = CTF_PREC_BASE; } - if ((cdp = malloc(sizeof (ctf_decl_node_t), M_FBT, M_WAITOK)) == NULL) { - cd->cd_err = EAGAIN; - return; - } - + cdp = malloc(sizeof(*cdp), M_FBT, M_WAITOK); cdp->cd_type = type; cdp->cd_kind = kind; cdp->cd_n = n; @@ -2002,8 +1146,8 @@ fbt_getargdesc(void *arg __unused, dtrac { const ushort_t *dp; fbt_probe_t *fbt = parg; - mod_ctf_t mc; - dtrace_modctl_t *ctl = fbt->fbtp_ctl; + mod_ctf_t *mc; + modctl_t *ctl = fbt->fbtp_ctl; int ndx = desc->dtargd_ndx; int symindx = fbt->fbtp_symindx; uint32_t *ctfoff; @@ -2011,41 +1155,44 @@ fbt_getargdesc(void *arg __unused, dtrac ushort_t info, kind, n; int nsyms; + if (fbt->fbtp_roffset != 0 && desc->dtargd_ndx == 0) { + (void) strcpy(desc->dtargd_native, "int"); + return; + } + desc->dtargd_ndx = DTRACE_ARGNONE; - /* Get a pointer to the CTF data and it's length. */ + /* Get a pointer to the CTF data and its length. 
*/ if (mod_ctf_get(ctl, &mc) != 0) { - static int report=0; + static int report = 0; if (report < 1) { - report++; - printf("FBT: Error no CTF section found in module \"%s\"\n", - ctl->mod_info->mi_name); + report++; + printf("FBT: Error no CTF section found in module \"%s\"\n", + module_name(ctl)); } /* No CTF data? Something wrong? *shrug* */ return; } - nsyms = (mc.nmap != NULL) ? mc.nmapsize : mc.nsym; + nsyms = (mc->nmap != NULL) ? mc->nmapsize : mc->nsym; /* Check if this module hasn't been initialised yet. */ - if (mc.ctfoffp == NULL) { + if (mc->ctfoffp == NULL) { /* * Initialise the CTF object and function symindx to * byte offset array. */ - if (fbt_ctfoff_init(ctl, &mc) != 0) { + if (fbt_ctfoff_init(ctl, mc) != 0) return; - } /* Initialise the CTF type to byte offset array. */ - if (fbt_typoff_init(&mc) != 0) { + if (fbt_typoff_init(mc) != 0) return; - } } - ctfoff = mc.ctfoffp; + ctfoff = mc->ctfoffp; - if (ctfoff == NULL || mc.typoffp == NULL) { + if (ctfoff == NULL || mc->typoffp == NULL) { return; } @@ -2057,7 +1204,7 @@ fbt_getargdesc(void *arg __unused, dtrac if ((offset = ctfoff[symindx]) == 0xffffffff) return; - dp = (const ushort_t *)(mc.ctftab + offset + sizeof(ctf_header_t)); + dp = (const ushort_t *)(mc->ctftab + offset + sizeof(ctf_header_t)); info = *dp++; kind = CTF_INFO_KIND(info); @@ -2075,23 +1222,50 @@ fbt_getargdesc(void *arg __unused, dtrac return; } - /* Check if the requested argument doesn't exist. */ - if (ndx >= n) - return; + if (fbt->fbtp_roffset != 0) { + /* Only return type is available for args[1] in return probe. */ + if (ndx > 1) + return; + ASSERT(ndx == 1); + } else { + /* Check if the requested argument doesn't exist. */ + if (ndx >= n) + return; - /* Skip the return type and arguments up to the one requested. */ - dp += ndx + 1; + /* Skip the return type and arguments up to the one requested. */ + dp += ndx + 1; + } - if (fbt_type_name(&mc, *dp, desc->dtargd_native, sizeof(desc->dtargd_native)) > 0) { + if (fbt_type_name(mc, *dp, desc->dtargd_native, sizeof(desc->dtargd_native)) > 0) desc->dtargd_ndx = ndx; - } return; } +#ifdef __FreeBSD__ +static int +fbt_linker_file_cb(linker_file_t lf, void *arg) +{ + + fbt_provide_module(arg, lf); + + return (0); +} +#endif + static void fbt_load(void) { + +#ifdef __FreeBSD__ + /* Create the /dev/dtrace/fbt entry. */ + fbt_cdev = make_dev(&fbt_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "dtrace/fbt"); +#endif +#ifdef __NetBSD__ + (void) module_specific_key_create(&fbt_module_key, fbt_module_dtor); +#endif + /* Default the probe table size if not specified. */ if (fbt_probetab_size == 0) fbt_probetab_size = FBT_PROBETAB_SIZE; @@ -2105,9 +1279,6 @@ fbt_load(void) dtrace_doubletrap_func = fbt_doubletrap; dtrace_invop_add(fbt_invop); -#ifdef __arm__ - dtrace_emulation_jump_addr = fbt_emulate; -#endif if (dtrace_register("fbt", &fbt_attr, DTRACE_PRIV_USER, NULL, &fbt_pops, NULL, &fbt_id) != 0) @@ -2120,9 +1291,6 @@ fbt_unload(void) { int error = 0; -#ifdef __arm__ - dtrace_emulation_jump_addr = NULL; -#endif /* De-register the invalid opcode handler. 
*/ dtrace_invop_remove(fbt_invop); @@ -2137,6 +1305,12 @@ fbt_unload(void) fbt_probetab = NULL; fbt_probetab_mask = 0; +#ifdef __FreeBSD__ + destroy_dev(fbt_cdev); +#endif +#ifdef __NetBSD__ + (void) module_specific_key_delete(fbt_module_key); +#endif return (error); } @@ -2170,4 +1344,15 @@ fbt_open(dev_t dev, int flags, int mode, return (0); } +#ifdef __FreeBSD__ +SYSINIT(fbt_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fbt_load, NULL); +SYSUNINIT(fbt_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fbt_unload, NULL); + +DEV_MODULE(fbt, fbt_modevent, NULL); +MODULE_VERSION(fbt, 1); +MODULE_DEPEND(fbt, dtrace, 1, 1, 1); +MODULE_DEPEND(fbt, opensolaris, 1, 1, 1); +#endif +#ifdef __NetBSD__ MODULE(MODULE_CLASS_MISC, dtrace_fbt, "dtrace,zlib"); +#endif Index: src/external/cddl/osnet/dev/fbt/fbt.h =================================================================== RCS file: src/external/cddl/osnet/dev/fbt/fbt.h diff -N src/external/cddl/osnet/dev/fbt/fbt.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/fbt/fbt.h 20 Jun 2017 13:43:33 -0000 @@ -0,0 +1,82 @@ +/* $NetBSD$ */ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Portions Copyright 2006-2008 John Birrell jb@freebsd.org + * + * $FreeBSD: head/sys/cddl/dev/fbt/fbt.h 298171 2016-04-17 23:08:47Z markj $ + * + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _FBT_H_ +#define _FBT_H_ + +#include "fbt_isa.h" + +typedef struct fbt_probe { + struct fbt_probe *fbtp_hashnext; + fbt_patchval_t *fbtp_patchpoint; + fbt_patchval_t fbtp_rval; + fbt_patchval_t fbtp_patchval; + fbt_patchval_t fbtp_savedval; + uintptr_t fbtp_roffset; + dtrace_id_t fbtp_id; + const char *fbtp_name; + modctl_t *fbtp_ctl; + int fbtp_loadcnt; + int fbtp_symindx; + struct fbt_probe *fbtp_next; +} fbt_probe_t; + +struct fbt_ksyms_arg { + modctl_t *fka_mod; + void *fka_mc; +}; + +struct linker_file; +struct linker_symval; +struct trapframe; + +int fbt_invop(uintptr_t, struct trapframe *, uintptr_t); +void fbt_patch_tracepoint(fbt_probe_t *, fbt_patchval_t); +int fbt_provide_module_function(struct linker_file *, int, + struct linker_symval *, void *); +int fbt_provide_module_cb(const char *, int, void *, + uint32_t, int, void *); +int fbt_excluded(const char *); + +extern dtrace_provider_id_t fbt_id; +extern fbt_probe_t **fbt_probetab; +extern int fbt_probetab_mask; + +#define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask) +#define FBT_PROBETAB_SIZE 0x8000 /* 32k entries -- 128K total */ + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_FBT); +#endif + +#endif Index: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c =================================================================== RCS file: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c diff -N src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/fbt/arm/fbt_isa.c 20 Jun 2017 19:14:24 -0000 @@ -0,0 +1,403 @@ +/* $NetBSD$ */ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Portions Copyright 2006-2008 John Birrell jb@freebsd.org + * Portions Copyright 2013 Justin Hibbits jhibbits@freebsd.org + * Portions Copyright 2013 Howard Su howardsu@freebsd.org + * + * $FreeBSD: head/sys/cddl/dev/fbt/arm/fbt_isa.c 312378 2017-01-18 13:27:24Z andrew $ + * + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include "fbt.h" + +#define FBT_PUSHM 0xe92d0000 +#define FBT_POPM 0xe8bd0000 +#define FBT_JUMP 0xea000000 +#define FBT_SUBSP 0xe24dd000 + +#define FBT_ENTRY "entry" +#define FBT_RETURN "return" + +int +fbt_invop(uintptr_t addr, struct trapframe *frame, uintptr_t rval) +{ + solaris_cpu_t *cpu = &solaris_cpu[cpu_number()]; + fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)]; + register_t fifthparam; + + for (; fbt != NULL; fbt = fbt->fbtp_hashnext) { + if ((uintptr_t)fbt->fbtp_patchpoint == addr) { + if (fbt->fbtp_roffset == 0) { + /* Get 5th parameter from stack */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + fifthparam = *(register_t *)frame->tf_svc_sp; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | + CPU_DTRACE_BADADDR); + + cpu->cpu_dtrace_caller = frame->tf_svc_lr; + dtrace_probe(fbt->fbtp_id, frame->tf_r0, + frame->tf_r1, frame->tf_r2, + frame->tf_r3, fifthparam); + } else { + /* XXX set caller */ + cpu->cpu_dtrace_caller = 0; + dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset, + rval, 0, 0, 0); + } + + cpu->cpu_dtrace_caller = 0; + return (fbt->fbtp_rval); + } + } + + return (0); +} + + +void +fbt_patch_tracepoint(fbt_probe_t *fbt, fbt_patchval_t val) +{ + dtrace_icookie_t c; + + c = dtrace_interrupt_disable(); + + ktext_write(fbt->fbtp_patchpoint, &val, sizeof (val)); + + dtrace_interrupt_enable(c); +} + +#ifdef __FreeBSD__ + +int +fbt_provide_module_function(linker_file_t lf, int symindx, + linker_symval_t *symval, void *opaque) +{ + char *modname = opaque; + const char *name = symval->name; + fbt_probe_t *fbt, *retfbt; + uint32_t *instr, *limit; + int popm; + + if (fbt_excluded(name)) + return (0); + + instr = (uint32_t *)symval->value; + limit = (uint32_t *)(symval->value + symval->size); + + /* + * va_arg functions has first instruction of + * sub sp, sp, #? + */ + if ((*instr & 0xfffff000) == FBT_SUBSP) + instr++; + + /* + * check if insn is a pushm with LR + */ + if ((*instr & 0xffff0000) != FBT_PUSHM || + (*instr & (1 << LR)) == 0) + return (0); + + fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + fbt->fbtp_name = name; + fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + name, FBT_ENTRY, 5, fbt); + fbt->fbtp_patchpoint = instr; + fbt->fbtp_ctl = lf; + fbt->fbtp_loadcnt = lf->loadcnt; + fbt->fbtp_savedval = *instr; + fbt->fbtp_patchval = FBT_BREAKPOINT; + fbt->fbtp_rval = DTRACE_INVOP_PUSHM; + fbt->fbtp_symindx = symindx; + + fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; + fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; + + lf->fbt_nentries++; + + popm = FBT_POPM | ((*instr) & 0x3FFF) | 0x8000; + + retfbt = NULL; +again: + for (; instr < limit; instr++) { + if (*instr == popm) + break; + else if ((*instr & 0xff000000) == FBT_JUMP) { + uint32_t *target, *start; + int offset; + + offset = (*instr & 0xffffff); + offset <<= 8; + offset /= 64; + target = instr + (2 + offset); + start = (uint32_t *)symval->value; + if (target >= limit || target < start) + break; + } + } + + if (instr >= limit) + return (0); + + /* + * We have a winner! 
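+	 * (This is either the matching pop that loads the return address
+	 * into PC, or a branch that leaves the function, i.e. a tail call.)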
+ */
+	fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
+	fbt->fbtp_name = name;
+	if (retfbt == NULL) {
+		fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+		    name, FBT_RETURN, 5, fbt);
+	} else {
+		retfbt->fbtp_next = fbt;
+		fbt->fbtp_id = retfbt->fbtp_id;
+	}
+	retfbt = fbt;
+
+	fbt->fbtp_patchpoint = instr;
+	fbt->fbtp_ctl = lf;
+	fbt->fbtp_loadcnt = lf->loadcnt;
+	fbt->fbtp_symindx = symindx;
+	if ((*instr & 0xff000000) == FBT_JUMP)
+		fbt->fbtp_rval = DTRACE_INVOP_B;
+	else
+		fbt->fbtp_rval = DTRACE_INVOP_POPM;
+	fbt->fbtp_savedval = *instr;
+	fbt->fbtp_patchval = FBT_BREAKPOINT;
+	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+
+	lf->fbt_nentries++;
+
+	instr++;
+	goto again;
+}
+
+#endif /* __FreeBSD__ */
+
+#ifdef __NetBSD__
+
+#define	FBT_PATCHVAL		DTRACE_BREAKPOINT
+
+/* entry and return */
+#define	FBT_BX_LR_P(insn)	(((insn) & ~INSN_COND_MASK) == 0x012fff1e)
+#define	FBT_B_LABEL_P(insn)	(((insn) & 0xff000000) == 0xea000000)
+/* entry */
+#define	FBT_MOV_IP_SP_P(insn)	((insn) == 0xe1a0c00d)
+/* index=1, add=1, wback=0 */
+#define	FBT_LDR_IMM_P(insn)	(((insn) & 0xfff00000) == 0xe5900000)
+#define	FBT_MOVW_P(insn)	(((insn) & 0xfff00000) == 0xe3000000)
+#define	FBT_MOV_IMM_P(insn)	(((insn) & 0xffff0000) == 0xe3a00000)
+#define	FBT_CMP_IMM_P(insn)	(((insn) & 0xfff00000) == 0xe3500000)
+#define	FBT_PUSH_P(insn)	(((insn) & 0xffff0000) == 0xe92d0000)
+/* return */
+/* cond=always, writeback=no, rn=sp and register_list includes pc */
+#define	FBT_LDM_P(insn)		(((insn) & 0x0fff8000) == 0x089d8000)
+#define	FBT_LDMIB_P(insn)	(((insn) & 0x0fff8000) == 0x099d8000)
+#define	FBT_MOV_PC_LR_P(insn)	(((insn) & ~INSN_COND_MASK) == 0x01a0f00e)
+/* cond=always, writeback=no, rn=sp and register_list includes lr, but not pc */
+#define	FBT_LDM_LR_P(insn)	(((insn) & 0xffffc000) == 0xe89d4000)
+#define	FBT_LDMIB_LR_P(insn)	(((insn) & 0xffffc000) == 0xe99d4000)
+
+/* rval = insn | invop_id (overwriting cond with invop ID) */
+#define	BUILD_RVAL(insn, id)	(((insn) & ~INSN_COND_MASK) | __SHIFTIN((id), INSN_COND_MASK))
+/* encode cond in the first byte */
+#define	PATCHVAL_ENCODE_COND(insn)	(FBT_PATCHVAL | __SHIFTOUT((insn), INSN_COND_MASK))
+
+int
+fbt_provide_module_cb(const char *name, int symindx, void *value,
+    uint32_t symsize, int type, void *opaque)
+{
+	fbt_probe_t *fbt, *retfbt;
+	uint32_t *instr, *limit;
+	bool was_ldm_lr = false;
+	int size;
+
+	struct fbt_ksyms_arg *fka = opaque;
+	modctl_t *mod = fka->fka_mod;
+	const char *modname = module_name(mod);
+
+
+	/* got a function? */
+	if (ELF_ST_TYPE(type) != STT_FUNC)
+		return 0;
+
+	if (fbt_excluded(name))
+		return (0);
+
+	/*
+	 * Exclude some more symbols which can be called from probe context.
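+	 * Patching them could recurse into dtrace while a probe is
+	 * already being processed.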
+ */ + if (strncmp(name, "_spl", 4) == 0 || + strcmp(name, "binuptime") == 0 || + strcmp(name, "nanouptime") == 0 || + strcmp(name, "dosoftints") == 0 || + strcmp(name, "fbt_emulate") == 0 || + strcmp(name, "undefinedinstruction") == 0 || + strncmp(name, "dmt_", 4) == 0 /* omap */ || + strncmp(name, "mvsoctmr_", 9) == 0 /* marvell */ ) { + return 0; + } + + instr = (uint32_t *) value; + limit = (uint32_t *)((uintptr_t)value + symsize); + + if (!FBT_MOV_IP_SP_P(*instr) + && !FBT_BX_LR_P(*instr) + && !FBT_MOVW_P(*instr) + && !FBT_MOV_IMM_P(*instr) + && !FBT_B_LABEL_P(*instr) + && !FBT_LDR_IMM_P(*instr) + && !FBT_CMP_IMM_P(*instr) + && !FBT_PUSH_P(*instr) + ) { + return 0; + } + + fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + fbt->fbtp_name = name; + fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + name, FBT_ENTRY, 5, fbt); + fbt->fbtp_patchpoint = instr; + fbt->fbtp_ctl = mod; + /* fbt->fbtp_loadcnt = lf->loadcnt; */ + if (FBT_MOV_IP_SP_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IP_SP); + else if (FBT_LDR_IMM_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDR_IMM); + else if (FBT_MOVW_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOVW); + else if (FBT_MOV_IMM_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_IMM); + else if (FBT_CMP_IMM_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_CMP_IMM); + else if (FBT_BX_LR_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR); + else if (FBT_PUSH_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_PUSHM); + else if (FBT_B_LABEL_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B); + else + KASSERT(0); + + KASSERTMSG((fbt->fbtp_rval >> 28) != 0, + "fbt %p insn 0x%x name %s rval 0x%08x", + fbt, *instr, name, fbt->fbtp_rval); + + fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr); + fbt->fbtp_savedval = *instr; + fbt->fbtp_symindx = symindx; + + fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; + fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; + + retfbt = NULL; + + while (instr < limit) { + if (instr >= limit) + return (0); + + size = 1; + + if (!FBT_BX_LR_P(*instr) + && !FBT_MOV_PC_LR_P(*instr) + && !FBT_LDM_P(*instr) + && !FBT_LDMIB_P(*instr) + && !(was_ldm_lr && FBT_B_LABEL_P(*instr)) + ) { + if (FBT_LDM_LR_P(*instr) || FBT_LDMIB_LR_P(*instr)) + was_ldm_lr = true; + else + was_ldm_lr = false; + instr += size; + continue; + } + + /* + * We have a winner! 
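+		 * (A return-style instruction: bx lr, mov pc,lr, an ldm
+		 * that restores PC, or a branch just after an ldm of LR.)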
+ */ + fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + fbt->fbtp_name = name; + + if (retfbt == NULL) { + fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + name, FBT_RETURN, 5, fbt); + } else { + retfbt->fbtp_next = fbt; + fbt->fbtp_id = retfbt->fbtp_id; + } + + retfbt = fbt; + fbt->fbtp_patchpoint = instr; + fbt->fbtp_ctl = mod; + /* fbt->fbtp_loadcnt = lf->loadcnt; */ + fbt->fbtp_symindx = symindx; + + if (FBT_BX_LR_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_BX_LR); + else if (FBT_MOV_PC_LR_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_MOV_PC_LR); + else if (FBT_LDM_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_LDM); + else if (FBT_LDMIB_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_POPM); + else if (FBT_B_LABEL_P(*instr)) + fbt->fbtp_rval = BUILD_RVAL(*instr, DTRACE_INVOP_B); + else + KASSERT(0); + + KASSERTMSG((fbt->fbtp_rval >> 28) != 0, "fbt %p name %s rval 0x%08x", + fbt, name, fbt->fbtp_rval); + + fbt->fbtp_roffset = (uintptr_t)(instr - (uint32_t *) value); + fbt->fbtp_patchval = PATCHVAL_ENCODE_COND(*instr); + + fbt->fbtp_savedval = *instr; + fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; + fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; + + instr += size; + was_ldm_lr = false; + } + + return 0; +} + +#endif /* __NetBSD__ */ Index: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h =================================================================== RCS file: src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h diff -N src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/fbt/arm/fbt_isa.h 12 Apr 2017 18:56:59 -0000 @@ -0,0 +1,32 @@ +/* $NetBSD$ */ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD: head/sys/cddl/dev/fbt/arm/fbt_isa.h 278529 2015-02-10 19:41:30Z gnn $ + * + */ + +#ifndef _FBT_ISA_H_ +#define _FBT_ISA_H_ + +typedef uint32_t fbt_patchval_t; + +#endif Index: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c =================================================================== RCS file: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c diff -N src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/fbt/x86/fbt_isa.c 15 Jun 2017 19:39:26 -0000 @@ -0,0 +1,425 @@ +/* $NetBSD$ */ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Portions Copyright 2006-2008 John Birrell jb@freebsd.org + * + * $FreeBSD: head/sys/cddl/dev/fbt/x86/fbt_isa.c 309785 2016-12-10 03:11:05Z markj $ + * + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#if 1 +#include +#include +#if 0 +#include +#endif +#include +#endif + +#include "fbt.h" + +#define FBT_PUSHL_EBP 0x55 +#define FBT_MOVL_ESP_EBP0_V0 0x8b +#define FBT_MOVL_ESP_EBP1_V0 0xec +#define FBT_MOVL_ESP_EBP0_V1 0x89 +#define FBT_MOVL_ESP_EBP1_V1 0xe5 +#define FBT_REX_RSP_RBP 0x48 + +#define FBT_POPL_EBP 0x5d +#define FBT_RET 0xc3 +#define FBT_RET_IMM16 0xc2 +#define FBT_LEAVE 0xc9 + +#ifdef __amd64__ +#define FBT_PATCHVAL 0xcc +#else +#define FBT_PATCHVAL 0xf0 +#endif + +#define FBT_ENTRY "entry" +#define FBT_RETURN "return" + +int +fbt_invop(uintptr_t addr, struct trapframe *frame, uintptr_t rval) +{ + solaris_cpu_t *cpu; + uintptr_t *stack; + uintptr_t arg0, arg1, arg2, arg3, arg4; + fbt_probe_t *fbt; + +#ifdef __amd64__ + stack = (uintptr_t *)frame->tf_rsp; +#else + /* Skip hardware-saved registers. */ +#ifdef __NetBSD__ + stack = (uintptr_t *)&frame->tf_esp; +#else + stack = (uintptr_t *)frame->tf_isp + 3; +#endif +#endif + + cpu = &solaris_cpu[cpu_number()]; + fbt = fbt_probetab[FBT_ADDR2NDX(addr)]; + for (; fbt != NULL; fbt = fbt->fbtp_hashnext) { + if ((uintptr_t)fbt->fbtp_patchpoint == addr) { + if (fbt->fbtp_roffset == 0) { +#ifdef __amd64__ + /* fbt->fbtp_rval == DTRACE_INVOP_PUSHQ_RBP */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + cpu->cpu_dtrace_caller = stack[0]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | + CPU_DTRACE_BADADDR); + + arg0 = frame->tf_rdi; + arg1 = frame->tf_rsi; + arg2 = frame->tf_rdx; + arg3 = frame->tf_rcx; + arg4 = frame->tf_r8; +#else + int i = 0; + + /* + * When accessing the arguments on the stack, + * we must protect against accessing beyond + * the stack. We can safely set NOFAULT here + * -- we know that interrupts are already + * disabled. + */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + cpu->cpu_dtrace_caller = stack[i++]; + arg0 = stack[i++]; + arg1 = stack[i++]; + arg2 = stack[i++]; + arg3 = stack[i++]; + arg4 = stack[i++]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | + CPU_DTRACE_BADADDR); +#endif + + dtrace_probe(fbt->fbtp_id, arg0, arg1, + arg2, arg3, arg4); + + cpu->cpu_dtrace_caller = 0; + } else { +#ifdef __amd64__ + /* + * On amd64, we instrument the ret, not the + * leave. We therefore need to set the caller + * to ensure that the top frame of a stack() + * action is correct. 
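+				 * (At the ret, the return address is still on
+				 * top of the stack, which is what stack[0]
+				 * reads here.)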
+ */
+				DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+				cpu->cpu_dtrace_caller = stack[0];
+				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT |
+				    CPU_DTRACE_BADADDR);
+#endif
+
+				dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset,
+				    rval, 0, 0, 0);
+				cpu->cpu_dtrace_caller = 0;
+			}
+
+			return (fbt->fbtp_rval);
+		}
+	}
+
+	return (0);
+}
+
+
+
+
+#ifdef __FreeBSD__
+void
+fbt_patch_tracepoint(fbt_probe_t *fbt, fbt_patchval_t val)
+{
+
+	*fbt->fbtp_patchpoint = val;
+}
+#endif
+
+#ifdef __NetBSD__
+void
+fbt_patch_tracepoint(fbt_probe_t *fbt, fbt_patchval_t val)
+{
+	u_long psl;
+	u_long cr0;
+
+	/* Disable interrupts. */
+	psl = x86_read_psl();
+	x86_disable_intr();
+
+	/* Disable write protection in supervisor mode. */
+	cr0 = rcr0();
+	lcr0(cr0 & ~CR0_WP);
+
+	for (; fbt != NULL; fbt = fbt->fbtp_next) {
+		*fbt->fbtp_patchpoint = val;
+	}
+
+	/* Write back and invalidate cache, flush pipelines. */
+	wbinvd();
+	x86_flush();
+	x86_write_psl(psl);
+
+	/* Re-enable write protection. */
+	lcr0(cr0);
+}
+#endif
+
+
+#ifdef __FreeBSD__
+int
+fbt_provide_module_function(linker_file_t lf, int symindx,
+    linker_symval_t *symval, void *opaque)
+#endif
+#ifdef __NetBSD__
+int
+fbt_provide_module_cb(const char *name, int symindx, void *value,
+    uint32_t symsize, int type, void *opaque)
+#endif
+{
+	fbt_probe_t *fbt, *retfbt;
+	u_int8_t *instr, *limit;
+	int j;
+	int size;
+
+#ifdef __FreeBSD__
+	char *modname = opaque;
+	const char *name = symval->name;
+	size_t symsize = symval->size;
+	void *value = symval->value;
+
+	/*
+	 * trap_check() is a wrapper for DTrace's fault handler, so we don't
+	 * want to be able to instrument it.
+	 */
+	if (strcmp(name, "trap_check") == 0)
+		return (0);
+#endif
+#ifdef __NetBSD__
+	struct fbt_ksyms_arg *fka = opaque;
+	modctl_t *mod = fka->fka_mod;
+	const char *modname = module_name(mod);
+
+	/* got a function? */
+	if (ELF_ST_TYPE(type) != STT_FUNC)
+		return 0;
+
+	/*
+	 * Exclude some more symbols which can be called from probe context.
+	 */
+	if (strcmp(name, "x86_curcpu") == 0 ||
+	    strcmp(name, "x86_curlwp") == 0) {
+		return 0;
+	}
+#endif
+
+	if (fbt_excluded(name))
+		return (0);
+
+	instr = (u_int8_t *) value;
+	limit = (u_int8_t *) value + symsize;
+
+#ifdef __amd64__
+	while (instr < limit) {
+		if (*instr == FBT_PUSHL_EBP)
+			break;
+
+		if ((size = dtrace_instr_size(instr)) <= 0)
+			break;
+
+		instr += size;
+	}
+
+	if (instr >= limit || *instr != FBT_PUSHL_EBP) {
+		/*
+		 * We either don't save the frame pointer in this
+		 * function, or we ran into some disassembly
+		 * screw-up.  Either way, we bail.
+ */
+		return (0);
+	}
+#else
+	if (instr[0] != FBT_PUSHL_EBP)
+		return (0);
+
+	if (!(instr[1] == FBT_MOVL_ESP_EBP0_V0 &&
+	    instr[2] == FBT_MOVL_ESP_EBP1_V0) &&
+	    !(instr[1] == FBT_MOVL_ESP_EBP0_V1 &&
+	    instr[2] == FBT_MOVL_ESP_EBP1_V1))
+		return (0);
+#endif
+
+	fbt = kmem_zalloc(sizeof (*fbt), KM_SLEEP);
+	fbt->fbtp_name = name;
+	fbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
+	    name, FBT_ENTRY, 3, fbt);
+	fbt->fbtp_patchpoint = instr;
+#ifdef __FreeBSD__
+	fbt->fbtp_ctl = lf;
+	fbt->fbtp_loadcnt = lf->loadcnt;
+#endif
+#ifdef __NetBSD__
+	fbt->fbtp_ctl = mod;
+#endif
+	fbt->fbtp_rval = DTRACE_INVOP_PUSHL_EBP;
+	fbt->fbtp_savedval = *instr;
+	fbt->fbtp_patchval = FBT_PATCHVAL;
+	fbt->fbtp_symindx = symindx;
+
+	fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+	fbt_probetab[FBT_ADDR2NDX(instr)] = fbt;
+#ifdef __FreeBSD__
+	lf->fbt_nentries++;
+#endif
+
+	retfbt = NULL;
+again:
+	if (instr >= limit)
+		return (0);
+
+	/*
+	 * If this disassembly fails, then we've likely walked off into
+	 * a jump table or some other unsuitable area.  Bail out of the
+	 * disassembly now.
+	 */
+	if ((size = dtrace_instr_size(instr)) <= 0)
+		return (0);
+
+#ifdef __amd64__
+	/*
+	 * We only instrument "ret" on amd64 -- we don't yet instrument
+	 * ret imm16, largely because the compiler doesn't seem to
+	 * (yet) emit them in the kernel...
+	 */
+	if (*instr != FBT_RET) {
+		instr += size;
+		goto again;
+	}
+#else
+	if (!(size == 1 &&
+	    (*instr == FBT_POPL_EBP || *instr == FBT_LEAVE) &&
+	    (*(instr + 1) == FBT_RET ||
+	    *(instr + 1) == FBT_RET_IMM16))) {
+		instr += size;
+		goto again;
+	}
+#endif
+
+	/*
+	 * We (desperately) want to avoid erroneously instrumenting a
+	 * jump table, especially given that our markers are pretty
+	 * short:  two bytes on x86, and just one byte on amd64.  To
+	 * determine if we're looking at a true instruction sequence
+	 * or an inline jump table that happens to contain the same
+	 * byte sequences, we resort to some heuristic sleaze:  we
+	 * treat this instruction as being contained within a pointer,
+	 * and see if that pointer points to within the body of the
+	 * function.  If it does, we refuse to instrument it.
+	 */
+	for (j = 0; j < sizeof (uintptr_t); j++) {
+		caddr_t check = (caddr_t) instr - j;
+		uint8_t *ptr;
+
+		if (check < (caddr_t)value)
+			break;
+
+		if (check + sizeof (caddr_t) > (caddr_t)limit)
+			continue;
+
+		ptr = *(uint8_t **)check;
+
+		if (ptr >= (uint8_t *) value && ptr < limit) {
+			instr += size;
+			goto again;
+		}
+	}
+
+	/*
+	 * We have a winner!
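+	 * (A plausible return site that did not look like inline
+	 * jump-table data, so it gets a return probe.)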
+ */ + fbt = kmem_zalloc(sizeof (*fbt), KM_SLEEP); + fbt->fbtp_name = name; + + if (retfbt == NULL) { + fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + name, FBT_RETURN, 3, fbt); + } else { + retfbt->fbtp_next = fbt; + fbt->fbtp_id = retfbt->fbtp_id; + } + + retfbt = fbt; + fbt->fbtp_patchpoint = instr; +#ifdef __FreeBSD__ + fbt->fbtp_ctl = lf; + fbt->fbtp_loadcnt = lf->loadcnt; +#endif +#ifdef __NetBSD__ + fbt->fbtp_ctl = mod; +#endif + fbt->fbtp_symindx = symindx; + +#ifndef __amd64__ + if (*instr == FBT_POPL_EBP) { + fbt->fbtp_rval = DTRACE_INVOP_POPL_EBP; + } else { + ASSERT(*instr == FBT_LEAVE); + fbt->fbtp_rval = DTRACE_INVOP_LEAVE; + } + fbt->fbtp_roffset = + (uintptr_t)(instr - (uint8_t *) value) + 1; + +#else + ASSERT(*instr == FBT_RET); + fbt->fbtp_rval = DTRACE_INVOP_RET; + fbt->fbtp_roffset = + (uintptr_t)(instr - (uint8_t *) value); +#endif + + fbt->fbtp_savedval = *instr; + fbt->fbtp_patchval = FBT_PATCHVAL; + fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; + fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; + +#ifdef __FreeBSD__ + lf->fbt_nentries++; +#endif + + instr += size; + goto again; +} Index: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h =================================================================== RCS file: src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h diff -N src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dev/fbt/x86/fbt_isa.h 19 Apr 2017 17:15:48 -0000 @@ -0,0 +1,32 @@ +/* $NetBSD$ */ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD: head/sys/cddl/dev/fbt/x86/fbt_isa.h 270067 2014-08-16 21:42:55Z markj $ + * + */ + +#ifndef _FBT_ISA_H_ +#define _FBT_ISA_H_ + +typedef uint8_t fbt_patchval_t; + +#endif Index: src/external/cddl/osnet/dev/lockstat/lockstat.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/lockstat/lockstat.c,v retrieving revision 1.8 diff -u -p -r1.8 lockstat.c --- src/external/cddl/osnet/dev/lockstat/lockstat.c 9 Mar 2015 01:42:26 -0000 1.8 +++ src/external/cddl/osnet/dev/lockstat/lockstat.c 20 Apr 2017 14:17:11 -0000 @@ -29,6 +29,7 @@ __KERNEL_RCSID(0, "$NetBSD: lockstat.c,v 1.8 2015/03/09 01:42:26 christos Exp $"); #include +#include #include #include #include @@ -41,8 +42,6 @@ __KERNEL_RCSID(0, "$NetBSD: lockstat.c,v #define NLOCKSTAT 1 #include -#define ASSERT KASSERT - typedef struct lockstat_probe { const char *lsp_func; const char *lsp_name; @@ -100,7 +99,7 @@ lockstat_disable(void *arg, dtrace_id_t /*ARGSUSED*/ static void -lockstat_provide(void *arg, const dtrace_probedesc_t *desc) +lockstat_provide(void *arg, dtrace_probedesc_t *desc) { int i = 0; Index: src/external/cddl/osnet/dev/profile/profile.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/profile/profile.c,v retrieving revision 1.7 diff -u -p -r1.7 profile.c --- src/external/cddl/osnet/dev/profile/profile.c 7 Jan 2017 21:39:52 -0000 1.7 +++ src/external/cddl/osnet/dev/profile/profile.c 6 May 2017 23:59:31 -0000 @@ -22,7 +22,7 @@ * * Portions Copyright 2006-2008 John Birrell jb@freebsd.org * - * $FreeBSD: src/sys/cddl/dev/profile/profile.c,v 1.1.4.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/dev/profile/profile.c 300618 2016-05-24 16:41:37Z br $ * */ @@ -44,7 +44,9 @@ #include #include #include -#include +#ifdef __FreeBSD__ +#include +#endif #include #include #include @@ -55,17 +57,22 @@ #include #ifdef __FreeBSD__ #include +#include #endif #include #include +#ifdef __FreeBSD__ +#include +#include +#endif #ifdef __NetBSD__ +#include #include #include -#define ASSERT(x) KASSERT(x) +#include #endif -#include #include #include @@ -109,7 +116,7 @@ */ #ifdef __FreeBSD__ #ifdef __amd64 -#define PROF_ARTIFICIAL_FRAMES 7 +#define PROF_ARTIFICIAL_FRAMES 10 #else #ifdef __i386 #define PROF_ARTIFICIAL_FRAMES 6 @@ -123,8 +130,44 @@ #endif #endif #endif + +#ifdef __mips +/* + * This value is bogus just to make module compilable on mips + */ +#define PROF_ARTIFICIAL_FRAMES 3 +#endif + +#ifdef __powerpc__ +/* + * This value is bogus just to make module compilable on powerpc + */ +#define PROF_ARTIFICIAL_FRAMES 3 +#endif + +struct profile_probe_percpu; + +#ifdef __mips +/* bogus */ +#define PROF_ARTIFICIAL_FRAMES 3 +#endif + +#ifdef __arm__ +#define PROF_ARTIFICIAL_FRAMES 3 +#endif + +#ifdef __aarch64__ +/* TODO: verify */ +#define PROF_ARTIFICIAL_FRAMES 10 +#endif + +#ifdef __riscv__ +/* TODO: verify */ +#define PROF_ARTIFICIAL_FRAMES 10 #endif +#endif /* __FreeBSD__ */ + #ifdef __NetBSD__ #define PROF_ARTIFICIAL_FRAMES 3 #endif @@ -133,14 +176,25 @@ typedef struct profile_probe { char prof_name[PROF_NAMELEN]; dtrace_id_t prof_id; int prof_kind; +#if defined(illumos) || defined(__NetBSD__) hrtime_t prof_interval; cyclic_id_t 
prof_cyclic; +#endif +#ifdef __FreeBSD__ + sbintime_t prof_interval; + struct callout prof_cyclic; + sbintime_t prof_expected; + struct profile_probe_percpu **prof_pcpus; +#endif } profile_probe_t; typedef struct profile_probe_percpu { hrtime_t profc_expected; hrtime_t profc_interval; profile_probe_t *profc_probe; +#ifdef __FreeBSD__ + struct callout profc_cyclic; +#endif } profile_probe_percpu_t; #ifdef __FreeBSD__ @@ -152,7 +206,7 @@ static void profile_destroy(void *, dtra static int profile_enable(void *, dtrace_id_t, void *); static void profile_disable(void *, dtrace_id_t, void *); static void profile_load(void *); -static void profile_provide(void *, const dtrace_probedesc_t *); +static void profile_provide(void *, dtrace_probedesc_t *); static int profile_rates[] = { 97, 199, 499, 997, 1999, @@ -213,8 +267,105 @@ static struct cdev *profile_cdev; #endif static dtrace_provider_id_t profile_id; static hrtime_t profile_interval_min = NANOSEC / 5000; /* 5000 hz */ -static int profile_aframes = 0; /* override */ +static int profile_aframes = PROF_ARTIFICIAL_FRAMES; + +#ifdef __FreeBSD__ +SYSCTL_DECL(_kern_dtrace); +SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD, 0, "DTrace profile parameters"); +SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes, + 0, "Skipped frames for profile provider"); + +static sbintime_t +nsec_to_sbt(hrtime_t nsec) +{ + time_t sec; + + /* + * We need to calculate nsec * 2^32 / 10^9 + * Seconds and nanoseconds are split to avoid overflow. + */ + sec = nsec / NANOSEC; + nsec = nsec % NANOSEC; + return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC); +} + +static hrtime_t +sbt_to_nsec(sbintime_t sbt) +{ + + return ((sbt >> 32) * NANOSEC + + (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32)); +} + +static void +profile_fire(void *arg) +{ + profile_probe_percpu_t *pcpu = arg; + profile_probe_t *prof = pcpu->profc_probe; + hrtime_t late; + struct trapframe *frame; + uintfptr_t pc, upc; + +#ifdef illumos + late = gethrtime() - pcpu->profc_expected; +#else + late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected); +#endif + + pc = 0; + upc = 0; + + /* + * td_intr_frame can be unset if this is a catch up event + * after waking up from idle sleep. + * This can only happen on a CPU idle thread. + */ + frame = curthread->td_intr_frame; + if (frame != NULL) { + if (TRAPF_USERMODE(frame)) + upc = TRAPF_PC(frame); + else + pc = TRAPF_PC(frame); + } + dtrace_probe(prof->prof_id, pc, upc, late, 0, 0); + + pcpu->profc_expected += pcpu->profc_interval; + callout_schedule_sbt_curcpu(&pcpu->profc_cyclic, + pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE); +} + +static void +profile_tick(void *arg) +{ + profile_probe_t *prof = arg; + struct trapframe *frame; + uintfptr_t pc, upc; + + pc = 0; + upc = 0; + + /* + * td_intr_frame can be unset if this is a catch up event + * after waking up from idle sleep. + * This can only happen on a CPU idle thread. 
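+	 * In that case pc and upc are simply left at 0 and the probe
+	 * still fires.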
+ */ + frame = curthread->td_intr_frame; + if (frame != NULL) { + if (TRAPF_USERMODE(frame)) + upc = TRAPF_PC(frame); + else + pc = TRAPF_PC(frame); + } + dtrace_probe(prof->prof_id, pc, upc, 0, 0, 0); + + prof->prof_expected += prof->prof_interval; + callout_schedule_sbt(&prof->prof_cyclic, + prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE); +} + +#endif +#ifdef __NetBSD__ static void profile_fire(void *arg) { @@ -240,6 +391,8 @@ profile_tick(void *arg) c->cpu_profile_upc, 0, 0, 0); } +#endif + static void profile_create(hrtime_t interval, char *name, int kind) { @@ -259,24 +412,29 @@ profile_create(hrtime_t interval, char * prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP); (void) strcpy(prof->prof_name, name); +#ifdef __FreeBSD__ + prof->prof_interval = nsec_to_sbt(interval); + callout_init(&prof->prof_cyclic, 1); +#else prof->prof_interval = interval; prof->prof_cyclic = CYCLIC_NONE; +#endif prof->prof_kind = kind; prof->prof_id = dtrace_probe_create(profile_id, NULL, NULL, name, - profile_aframes ? profile_aframes : PROF_ARTIFICIAL_FRAMES, prof); + profile_aframes, prof); } /*ARGSUSED*/ static void -profile_provide(void *arg, const dtrace_probedesc_t *desc) +profile_provide(void *arg, dtrace_probedesc_t *desc) { int i, j, rate, kind; hrtime_t val = 0, mult = 1, len = 0; char *name, *suffix = NULL; const struct { - const char *prefix; + char *prefix; int kind; } types[] = { { PROF_PREFIX_PROFILE, PROF_PROFILE }, @@ -285,7 +443,7 @@ profile_provide(void *arg, const dtrace_ }; const struct { - const char *name; + char *name; hrtime_t mult; } suffixes[] = { { "ns", NANOSEC / NANOSEC }, @@ -333,7 +491,7 @@ profile_provide(void *arg, const dtrace_ return; } - name = (char *)desc->dtpd_name; + name = desc->dtpd_name; for (i = 0; types[i].prefix != NULL; i++) { len = strlen(types[i].prefix); @@ -405,13 +563,19 @@ profile_destroy(void *arg, dtrace_id_t i { profile_probe_t *prof = parg; +#ifdef __FreeBSD__ + ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL); +#else ASSERT(prof->prof_cyclic == CYCLIC_NONE); +#endif kmem_free(prof, sizeof (profile_probe_t)); ASSERT(profile_total >= 1); atomic_add_32(&profile_total, -1); } +#ifndef __FreeBSD__ + /*ARGSUSED*/ static void profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) @@ -488,6 +652,81 @@ profile_disable(void *arg, dtrace_id_t i prof->prof_cyclic = CYCLIC_NONE; } +#else + +static void +profile_enable_omni(profile_probe_t *prof) +{ + profile_probe_percpu_t *pcpu; + int cpu; + + prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP); + CPU_FOREACH(cpu) { + pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP); + prof->prof_pcpus[cpu] = pcpu; + pcpu->profc_probe = prof; + pcpu->profc_expected = sbinuptime() + prof->prof_interval; + pcpu->profc_interval = prof->prof_interval; + callout_init(&pcpu->profc_cyclic, 1); + callout_reset_sbt_on(&pcpu->profc_cyclic, + pcpu->profc_expected, 0, profile_fire, pcpu, + cpu, C_DIRECT_EXEC | C_ABSOLUTE); + } +} + +static void +profile_disable_omni(profile_probe_t *prof) +{ + profile_probe_percpu_t *pcpu; + int cpu; + + ASSERT(prof->prof_pcpus != NULL); + CPU_FOREACH(cpu) { + pcpu = prof->prof_pcpus[cpu]; + ASSERT(pcpu->profc_probe == prof); + ASSERT(callout_active(&pcpu->profc_cyclic)); + callout_stop(&pcpu->profc_cyclic); + callout_drain(&pcpu->profc_cyclic); + kmem_free(pcpu, sizeof(profile_probe_percpu_t)); + } + kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu)); + prof->prof_pcpus = NULL; +} + +/* ARGSUSED */ +static void 
+profile_enable(void *arg, dtrace_id_t id, void *parg) +{ + profile_probe_t *prof = parg; + + if (prof->prof_kind == PROF_TICK) { + prof->prof_expected = sbinuptime() + prof->prof_interval; + callout_reset_sbt(&prof->prof_cyclic, + prof->prof_expected, 0, profile_tick, prof, + C_DIRECT_EXEC | C_ABSOLUTE); + } else { + ASSERT(prof->prof_kind == PROF_PROFILE); + profile_enable_omni(prof); + } +} + +/* ARGSUSED */ +static void +profile_disable(void *arg, dtrace_id_t id, void *parg) +{ + profile_probe_t *prof = parg; + + if (prof->prof_kind == PROF_TICK) { + ASSERT(callout_active(&prof->prof_cyclic)); + callout_stop(&prof->prof_cyclic); + callout_drain(&prof->prof_cyclic); + } else { + ASSERT(prof->prof_kind == PROF_PROFILE); + profile_disable_omni(prof); + } +} +#endif + static void profile_load(void *dummy) { Index: src/external/cddl/osnet/dev/sdt/sdt.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/sdt/sdt.c,v retrieving revision 1.18 diff -u -p -r1.18 sdt.c --- src/external/cddl/osnet/dev/sdt/sdt.c 7 Jan 2017 21:39:52 -0000 1.18 +++ src/external/cddl/osnet/dev/sdt/sdt.c 20 Apr 2017 13:42:05 -0000 @@ -20,7 +20,7 @@ * * Portions Copyright 2006-2008 John Birrell jb@freebsd.org * - * $FreeBSD: head/sys/cddl/dev/sdt/sdt.c 285703 2015-07-19 22:14:09Z markj $ + * $FreeBSD: head/sys/cddl/dev/sdt/sdt.c 297771 2016-04-10 01:24:27Z markj $ * */ @@ -42,6 +42,7 @@ __KERNEL_RCSID(0, "$NetBSD: sdt.c,v 1.18 2017/01/07 21:39:52 christos Exp $"); #include +#include #include #include @@ -72,7 +73,7 @@ __KERNEL_RCSID(0, "$NetBSD: sdt.c,v 1.18 /* DTrace methods. */ static void sdt_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *); -static void sdt_provide_probes(void *, const dtrace_probedesc_t *); +static void sdt_provide_probes(void *, dtrace_probedesc_t *); static void sdt_destroy(void *, dtrace_id_t, void *); static int sdt_enable(void *, dtrace_id_t, void *); static void sdt_disable(void *, dtrace_id_t, void *); @@ -115,7 +116,7 @@ static dtrace_pops_t sdt_pops = { static int sdt_open(dev_t dev, int flags, int mode, struct lwp *l) { - return (0); + return 0; } static const struct cdevsw sdt_cdevsw = { @@ -137,8 +138,8 @@ static const struct cdevsw sdt_cdevsw = static TAILQ_HEAD(, sdt_provider) sdt_prov_list; #ifdef __FreeBSD__ -eventhandler_tag sdt_kld_load_tag; -eventhandler_tag sdt_kld_unload_try_tag; +static eventhandler_tag sdt_kld_load_tag; +static eventhandler_tag sdt_kld_unload_try_tag; #endif #ifdef __NetBSD__ @@ -191,6 +192,12 @@ sdt_create_probe(struct sdt_probe *probe char *to; size_t len; + if (probe->version != (int)sizeof(*probe)) { + printf("ignoring probe %p, version %u expected %u\n", + probe, probe->version, (int)sizeof(*probe)); + return; + } + TAILQ_FOREACH(prov, &sdt_prov_list, prov_entry) if (strcmp(prov->name, probe->prov->name) == 0) break; @@ -214,6 +221,8 @@ sdt_create_probe(struct sdt_probe *probe * in the C compiler, so we have to respect const vs non-const. */ strlcpy(func, probe->func, sizeof(func)); + if (func[0] == '\0') + strcpy(func, "none"); from = probe->name; to = name; @@ -239,7 +248,7 @@ sdt_create_probe(struct sdt_probe *probe * requires one of provide_probes and provide_module to be defined. 
*/ static void -sdt_provide_probes(void *arg, const dtrace_probedesc_t *desc) +sdt_provide_probes(void *arg, dtrace_probedesc_t *desc) { } @@ -248,10 +257,6 @@ sdt_enable(void *arg __unused, dtrace_id { struct sdt_probe *probe = parg; -#ifdef SDT_DEBUG - printf("sdt: %s\n", __func__); -#endif - probe->id = id; #ifdef __FreeBSD__ probe->sdtp_lf->nenabled++; @@ -268,13 +273,6 @@ sdt_disable(void *arg __unused, dtrace_i #ifdef __FreeBSD__ SDT_KASSERT(probe->sdtp_lf->nenabled > 0, ("no probes enabled")); -#endif - -#ifdef SDT_DEBUG - printf("sdt: %s\n", __func__); -#endif - -#ifdef __FreeBSD__ if (strcmp(probe->prov->name, "lockstat") == 0) lockstat_enabled--; probe->sdtp_lf->nenabled--; @@ -288,16 +286,6 @@ sdt_getargdesc(void *arg, dtrace_id_t id struct sdt_argtype *argtype; struct sdt_probe *probe = parg; -#ifdef SDT_DEBUG - printf("sdt: %s probe %d\n", __func__, id); - printf("%s: probe %d (%s:%s:%s:%s).%d\n", - __func__, id, - probe->provider, - probe->module, - probe->function, - probe->name, - desc->dtargd_ndx); -#endif if (desc->dtargd_ndx >= probe->n_args) { desc->dtargd_ndx = DTRACE_ARGNONE; return; @@ -488,12 +476,19 @@ sdt_load(void) { TAILQ_INIT(&sdt_prov_list); - sdt_init(dtrace_probe); #ifdef __FreeBSD__ + sdt_probe_func = dtrace_probe; + + sdt_kld_load_tag = EVENTHANDLER_REGISTER(kld_load, sdt_kld_load, NULL, + EVENTHANDLER_PRI_ANY); + sdt_kld_unload_try_tag = EVENTHANDLER_REGISTER(kld_unload_try, + sdt_kld_unload_try, NULL, EVENTHANDLER_PRI_ANY); + /* Pick up probes from the kernel and already-loaded linker files. */ linker_file_foreach(sdt_linker_file_cb, NULL); #endif #ifdef __NetBSD__ + sdt_init(dtrace_probe); sdt_link_set_load(); #endif } @@ -504,53 +499,51 @@ sdt_unload(void) struct sdt_provider *prov, *tmp; int ret; - sdt_exit(); +#ifdef __FreeBSD__ + EVENTHANDLER_DEREGISTER(kld_load, sdt_kld_load_tag); + EVENTHANDLER_DEREGISTER(kld_unload_try, sdt_kld_unload_try_tag); + + sdt_probe_func = sdt_probe_stub; +#endif #ifdef __NetBSD__ + sdt_exit(); + sdt_link_set_unload(); #endif TAILQ_FOREACH_SAFE(prov, &sdt_prov_list, prov_entry, tmp) { ret = dtrace_unregister(prov->id); if (ret != 0) - return ret; + return (ret); TAILQ_REMOVE(&sdt_prov_list, prov, prov_entry); free(__UNCONST(prov->name), M_SDT); free(prov, M_SDT); } - return 0; + return (0); } #ifdef __FreeBSD__ static int sdt_modevent(module_t mod __unused, int type, void *data __unused) { - int error = 0; switch (type) { case MOD_LOAD: - sdt_load(); - break; - case MOD_UNLOAD: - error = sdt_unload(); - break; - case MOD_SHUTDOWN: - break; - + return (0); default: - error = EOPNOTSUPP; - break; + return (EOPNOTSUPP); } - - return (error); } +SYSINIT(sdt_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, sdt_load, NULL); +SYSUNINIT(sdt_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, sdt_unload, NULL); + DEV_MODULE(sdt, sdt_modevent, NULL); MODULE_VERSION(sdt, 1); MODULE_DEPEND(sdt, dtrace, 1, 1, 1); -MODULE_DEPEND(sdt, opensolaris, 1, 1, 1); #endif #ifdef __NetBSD__ Index: src/external/cddl/osnet/dev/systrace/systrace.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dev/systrace/systrace.c,v retrieving revision 1.9 diff -u -p -r1.9 systrace.c --- src/external/cddl/osnet/dev/systrace/systrace.c 7 Jan 2017 21:39:52 -0000 1.9 +++ src/external/cddl/osnet/dev/systrace/systrace.c 20 Apr 2017 14:31:12 -0000 @@ -22,8 +22,6 @@ * * Portions Copyright 2006-2008 John Birrell jb@freebsd.org * - * $FreeBSD: src/sys/cddl/dev/systrace/systrace.c,v 1.2.2.1 2009/08/03 
08:13:06 kensmith Exp $ - * */ /* @@ -32,6 +30,9 @@ */ #include +/* __FBSDID("$FreeBSD: head/sys/cddl/dev/systrace/systrace.c 306220 2016-09-22 23:22:53Z markj $"); */ + +#include #include #include #include @@ -55,12 +56,82 @@ #include #include +#include "dtrace_cddl.h" #include "emultrace.h" #define CONCAT(x,y) __CONCAT(x,y) #define STRING(s) __STRING(s) +#ifdef __FreeBSD__ +#ifdef LINUX_SYSTRACE +#if defined(__amd64__) +#include +#include +#include +#include +#elif defined(__i386__) +#include +#include +#include +#include +#else +#error Only i386 and amd64 are supported. +#endif +#define MODNAME "linux" +extern struct sysent linux_sysent[]; +#define MAXSYSCALL LINUX_SYS_MAXSYSCALL +#define SYSCALLNAMES linux_syscallnames +#define SYSENT linux_sysent +#elif defined(LINUX32_SYSTRACE) +#if defined(__amd64__) +#include +#include +#include +#include +#else +#error Only amd64 is supported. +#endif +#define MODNAME "linux32" +extern struct sysent linux32_sysent[]; +#define MAXSYSCALL LINUX32_SYS_MAXSYSCALL +#define SYSCALLNAMES linux32_syscallnames +#define SYSENT linux32_sysent +#elif defined(FREEBSD32_SYSTRACE) +/* + * The syscall arguments are processed into a DTrace argument array + * using a generated function. See sys/kern/makesyscalls.sh. + */ +#include +#include +#include +#include +extern const char *freebsd32_syscallnames[]; +#define MODNAME "freebsd32" +#define MAXSYSCALL FREEBSD32_SYS_MAXSYSCALL +#define SYSCALLNAMES freebsd32_syscallnames +#define SYSENT freebsd32_sysent +#else +/* + * The syscall arguments are processed into a DTrace argument array + * using a generated function. See sys/kern/makesyscalls.sh. + */ +#include +#include +#define MODNAME "freebsd" +#define MAXSYSCALL SYS_MAXSYSCALL +#define SYSCALLNAMES syscallnames +#define SYSENT sysent +#define NATIVE_ABI +#endif + +#define PROVNAME "syscall" +#define DEVNAME "dtrace/systrace/" MODNAME +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ +#include + #ifndef NATIVE extern const char * const CONCAT(emulname,_syscallnames)[]; extern const char * const CONCAT(alt,CONCAT(emulname,_syscallnames))[]; @@ -87,6 +158,8 @@ extern const char * const altsyscallname #define MODCMD CONCAT(MODNAME,_modcmd) #define EMUL CONCAT(emul_,emulname) extern struct emul EMUL; +#define curthread curlwp +#endif /* __NetBSD__ */ #define SYSTRACE_ARTIFICIAL_FRAMES 1 @@ -102,12 +175,20 @@ extern struct emul EMUL; static int systrace_unload(void); static void systrace_getargdesc(void *, dtrace_id_t, void *, dtrace_argdesc_t *); -static void systrace_provide(void *, const dtrace_probedesc_t *); +static uint64_t systrace_getargval(void *, dtrace_id_t, void *, int, int); +static void systrace_provide(void *, dtrace_probedesc_t *); static void systrace_destroy(void *, dtrace_id_t, void *); static int systrace_enable(void *, dtrace_id_t, void *); static void systrace_disable(void *, dtrace_id_t, void *); static void systrace_load(void *); +#ifdef __FreeBSD__ +static union { + const char **p_constnames; + char **pp_syscallnames; +} uglyhack = { SYSCALLNAMES }; +#endif + static dtrace_pattr_t systrace_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, @@ -124,7 +205,7 @@ static dtrace_pops_t systrace_pops = { NULL, NULL, systrace_getargdesc, - NULL, + systrace_getargval, NULL, systrace_destroy }; @@ -138,6 +219,57 @@ static dtrace_provider_id_t systrace_id; * array the syscall comes from. 
It could be a standard syscall or a * compat syscall from something like Linux. */ +#ifdef __FreeBSD__ +#ifdef NATIVE_ABI +static void +systrace_probe(struct syscall_args *sa, enum systrace_probe_t type, int retval) +{ + uint64_t uargs[nitems(sa->args)]; + dtrace_id_t id; + int n_args, sysnum; + + sysnum = sa->code; + memset(uargs, 0, sizeof(uargs)); + + if (type == SYSTRACE_ENTRY) { + if ((id = sa->callp->sy_entry) == DTRACE_IDNONE) + return; + + if (sa->callp->sy_systrace_args_func != NULL) + /* + * Convert the syscall parameters using the registered + * function. + */ + (*sa->callp->sy_systrace_args_func)(sysnum, sa->args, + uargs, &n_args); + else + /* + * Use the built-in system call argument conversion + * function to translate the syscall structure fields + * into the array of 64-bit values that DTrace expects. + */ + systrace_args(sysnum, sa->args, uargs, &n_args); + /* + * Save probe arguments now so that we can retrieve them if + * the getargval method is called from further down the stack. + */ + curthread->t_dtrace_systrace_args = uargs; + } else { + if ((id = sa->callp->sy_return) == DTRACE_IDNONE) + return; + + curthread->t_dtrace_systrace_args = NULL; + /* Set arg0 and arg1 as the return value of this syscall. */ + uargs[0] = uargs[1] = retval; + } + + /* Process the probe using the converted argments. */ + dtrace_probe(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]); +} +#endif /* NATIVE_ABI */ +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ static void systrace_probe(uint32_t id, register_t sysnum, const struct sysent *se, const void *params, const register_t *ret, int error) @@ -160,24 +292,47 @@ systrace_probe(uint32_t id, register_t s /* XXX: fix for more arguments! */ dtrace_probe(id, uargs[0], uargs[1], uargs[2], uargs[3], uargs[4]); } +#endif static void systrace_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc) { int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); + if (SYSTRACE_ISENTRY((uintptr_t)parg)) - systrace_entry_setargdesc(sysnum, desc->dtargd_ndx, + systrace_entry_setargdesc(sysnum, desc->dtargd_ndx, desc->dtargd_native, sizeof(desc->dtargd_native)); else - systrace_return_setargdesc(sysnum, desc->dtargd_ndx, + systrace_return_setargdesc(sysnum, desc->dtargd_ndx, desc->dtargd_native, sizeof(desc->dtargd_native)); if (desc->dtargd_native[0] == '\0') desc->dtargd_ndx = DTRACE_ARGNONE; } +static uint64_t +systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) +{ + uint64_t *uargs; + + uargs = curthread->t_dtrace_systrace_args; + if (uargs == NULL) + /* This is a return probe. 
*/ + return (0); +#ifdef __FreeBSD__ + if (argno >= nitems(((struct syscall_args *)NULL)->args)) + return (0); +#endif +#ifdef __NetBSD__ + if (argno >= SYS_MAXSYSARGS) + return (0); +#endif + + return (uargs[argno]); +} + static void -systrace_provide(void *arg, const dtrace_probedesc_t *desc) +systrace_provide(void *arg, dtrace_probedesc_t *desc) { int i; @@ -185,6 +340,20 @@ systrace_provide(void *arg, const dtrace return; for (i = 0; i < MAXSYSCALL; i++) { +#ifdef __FreeBSD__ + if (dtrace_probe_lookup(systrace_id, MODNAME, + uglyhack.pp_syscallnames[i], "entry") != 0) + continue; + + (void)dtrace_probe_create(systrace_id, MODNAME, + uglyhack.pp_syscallnames[i], "entry", + SYSTRACE_ARTIFICIAL_FRAMES, + (void *)((uintptr_t)SYSTRACE_ENTRY(i))); + (void)dtrace_probe_create(systrace_id, MODNAME, + uglyhack.pp_syscallnames[i], "return", + SYSTRACE_ARTIFICIAL_FRAMES, + (void *)((uintptr_t)SYSTRACE_RETURN(i))); +#else const char *name = ALTSYSCALLNAMES[i] ? ALTSYSCALLNAMES[i] : SYSCALLNAMES[i]; if (dtrace_probe_lookup(systrace_id, NULL, name, "entry") != 0) @@ -196,6 +365,7 @@ systrace_provide(void *arg, const dtrace (void) dtrace_probe_create(systrace_id, NULL, name, "return", SYSTRACE_ARTIFICIAL_FRAMES, (void *)(intptr_t)SYSTRACE_RETURN(i)); +#endif } } @@ -222,10 +392,16 @@ systrace_enable(void *arg, dtrace_id_t i { int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); +#ifdef __FreeBSD__ + if (SYSENT[sysnum].sy_systrace_args_func == NULL) + SYSENT[sysnum].sy_systrace_args_func = systrace_args; +#endif + if (SYSTRACE_ISENTRY((uintptr_t)parg)) SYSENT[sysnum].sy_entry = id; else SYSENT[sysnum].sy_return = id; + return 0; } @@ -241,11 +417,16 @@ systrace_disable(void *arg, dtrace_id_t static void systrace_load(void *dummy) { - if (dtrace_register(PROVNAME, &systrace_attr, DTRACE_PRIV_USER, - NULL, &systrace_pops, NULL, &systrace_id) != 0) + if (dtrace_register(PROVNAME, &systrace_attr, DTRACE_PRIV_USER, NULL, + &systrace_pops, NULL, &systrace_id) != 0) return; +#ifdef NATIVE_ABI + systrace_probe_func = systrace_probe; +#endif +#ifdef __NetBSD__ EMUL.e_dtrace_syscall = systrace_probe; +#endif } @@ -254,14 +435,80 @@ systrace_unload() { int error; +#ifdef NATIVE_ABI + systrace_probe_func = NULL; +#endif +#ifdef __NetBSD__ + EMUL.e_dtrace_syscall = NULL; +#endif + if ((error = dtrace_unregister(systrace_id)) != 0) return (error); - EMUL.e_dtrace_syscall = NULL; - return error; } +#ifdef __FreeBSD__ +static int +systrace_modevent(module_t mod __unused, int type, void *data __unused) +{ + int error; + + error = 0; + switch (type) { + case MOD_LOAD: + break; + + case MOD_UNLOAD: + break; + + case MOD_SHUTDOWN: + break; + + default: + error = EOPNOTSUPP; + break; + + } + return (error); +} + +SYSINIT(systrace_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, + systrace_load, NULL); +SYSUNINIT(systrace_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, + systrace_unload, NULL); + +#ifdef LINUX_SYSTRACE +DEV_MODULE(systrace_linux, systrace_modevent, NULL); +MODULE_VERSION(systrace_linux, 1); +#ifdef __amd64__ +MODULE_DEPEND(systrace_linux, linux64, 1, 1, 1); +#else +MODULE_DEPEND(systrace_linux, linux, 1, 1, 1); +#endif +MODULE_DEPEND(systrace_linux, dtrace, 1, 1, 1); +MODULE_DEPEND(systrace_linux, opensolaris, 1, 1, 1); +#elif defined(LINUX32_SYSTRACE) +DEV_MODULE(systrace_linux32, systrace_modevent, NULL); +MODULE_VERSION(systrace_linux32, 1); +MODULE_DEPEND(systrace_linux32, linux, 1, 1, 1); +MODULE_DEPEND(systrace_linux32, dtrace, 1, 1, 1); +MODULE_DEPEND(systrace_linux32, opensolaris, 1, 1, 1); +#elif 
defined(FREEBSD32_SYSTRACE) +DEV_MODULE(systrace_freebsd32, systrace_modevent, NULL); +MODULE_VERSION(systrace_freebsd32, 1); +MODULE_DEPEND(systrace_freebsd32, dtrace, 1, 1, 1); +MODULE_DEPEND(systrace_freebsd32, opensolaris, 1, 1, 1); +#else +DEV_MODULE(systrace, systrace_modevent, NULL); +MODULE_VERSION(systrace, 1); +MODULE_DEPEND(systrace, dtrace, 1, 1, 1); +MODULE_DEPEND(systrace, opensolaris, 1, 1, 1); +#endif +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ + static int MODCMD(modcmd_t cmd, void *data) { @@ -282,3 +529,5 @@ MODCMD(modcmd_t cmd, void *data) } MODULE(MODULE_CLASS_MISC, MODNAME, MODDEP) + +#endif /* __NetBSD__ */ Index: src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1 =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1,v retrieving revision 1.3 diff -u -p -r1.3 dtrace.1 --- src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1 12 May 2017 21:01:36 -0000 1.3 +++ src/external/cddl/osnet/dist/cmd/dtrace/dtrace.1 17 May 2017 00:00:52 -0000 @@ -2,7 +2,7 @@ .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the -.\" Common Development and Distribution License (the "License"). +.\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE @@ -18,653 +18,660 @@ .\" .\" CDDL HEADER END .\" Copyright (c) 2006, Sun Microsystems, Inc. All Rights Reserved. -.TH dtrace 1 "5 Sep 2006" "SunOS 5.11" "System Administration Commands" -.SH NAME -dtrace \- DTrace dynamic tracing compiler and tracing utility -.SH SYNOPSIS -.LP -.nf -\fBdtrace\fR [\fB-32\fR | \fB-64\fR] [\fB-aACeFGHhlqSvVwZ\fR] [\fB-b\fR \fIbufsz\fR] [\fB-c\fR \fIcmd\fR] - [\fB-D\fR \fIname\fR [\fI=value\fR]] [\fB-I\fR \fIpath\fR] [\fB-L\fR \fIpath\fR] [\fB-o\fR \fIoutput\fR] - [\fB-s\fR \fIscript\fR] [\fB-U\fR \fIname\fR] [\fB-x\fR \fIarg\fR [\fI=val\fR]] - [\fB-X\fR a | c | s | t] [\fB-p\fR \fIpid\fR] - [\fB-P\fR \fIprovider\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-m\fR [\fIprovider:\fR] \fImodule\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-f\fR [[\fIprovider:\fR] \fImodule:\fR] \fIfunction\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]] - [\fB-i\fR \fIprobe-id\fR [[\fIpredicate\fR] \fIaction\fR]] -.fi - -.SH DESCRIPTION -.sp -.LP -DTrace is a comprehensive dynamic tracing framework for the Solaris Operating System. DTrace provides a powerful infrastructure that permits administrators, developers, and service personnel to concisely answer arbitrary questions about the behavior of the operating system and user programs. -.sp -.LP -The \fISolaris Dynamic Tracing Guide\fR describes how to use DTrace to observe, debug, and tune system behavior. Refer to this book for a detailed description of DTrace features, including the bundled DTrace observability -tools, instrumentation providers, and the D programming language. 
-.sp -.LP -The \fBdtrace\fR command provides a generic interface to the essential services provided by the DTrace facility, including: -.RS +4 -.TP -.ie t \(bu -.el o +.\" +.\" $FreeBSD: head/cddl/contrib/opensolaris/cmd/dtrace/dtrace.1 281705 2015-04-18 21:00:36Z markj $ +.\" +.Dd April 18, 2015 +.Dt DTRACE 1 +.Os +.Sh NAME +.Nm dtrace +.Nd dynamic tracing compiler and tracing utility +.Sh SYNOPSIS +.Nm +.Op Fl 32 | Fl 64 +.Op Fl aACeFGhHlqSvVwZ +.Op Fl b Ar bufsz +.Op Fl c Ar cmd +.Op Fl D Ar name Op Ns = Ns value +.Op Fl I Ar path +.Op Fl L Ar path +.Op Fl o Ar output +.Op Fl s Ar script +.Op Fl U Ar name +.Op Fl x Ar arg Op Ns = Ns value +.Op Fl X Cm a | c | s | t +.Op Fl p Ar pid +.Op Fl P Ar provider Oo Oo Ar predicate Oc Ar action Oc +.Op Fl m Oo Ar provider : Oc Ar module Oo Oo Ar predicate Oc Ar action Oc +.Op Fl f Oo Oo Ar provider : Oc Ar module : Oc Ar function Oo Oo Ar predicate \ + Oc Ar action Oc +.Op Fl n Oo Oo Oo Ar provider : Oc Ar module : Oc Ar function : Oc Ar name \ + Oo Oo Ar predicate Oc Ar action Oc +.Op Fl i Ar probe-id Oo Oo Ar predicate Oc Ar action Oc +.Sh DESCRIPTION +DTrace is a comprehensive dynamic tracing framework ported from Solaris. +DTrace provides a powerful infrastructure that permits administrators, +developers, and service personnel to concisely answer arbitrary questions about +the behavior of the operating system and user programs. +.Pp +The +.Nm +command provides a generic interface to the essential services provided by the +DTrace facility, including: +.Bl -bullet -offset indent +.It Options that list the set of probes and providers currently published by DTrace -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that enable probes directly using any of the probe description specifiers (provider, module, function, name) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that run the D compiler and compile one or more D program files or programs written directly on the command line -.RE -.RS +4 -.TP -.ie t \(bu -.el o +.It +Options that enable probes directly using any of the probe description +specifiers (provider, module, function, name) +.It +Options that run the D compiler and compile one or more D program files or +programs written directly on the command line +.It Options that generate anonymous tracing programs -.RE -.RS +4 -.TP -.ie t \(bu -.el o +.It Options that generate program stability reports -.RE -.RS +4 -.TP -.ie t \(bu -.el o -Options that modify DTrace tracing and buffering behavior and enable additional D compiler features -.RE -.sp -.LP -You can use \fBdtrace\fR to create D scripts by using it in a \fB#!\fR declaration to create an interpreter file. You can also use \fBdtrace\fR to attempt to compile D programs and determine their properties without actually enabling tracing using the \fB-e\fR option. See \fBOPTIONS\fR. See the \fISolaris Dynamic Tracing Guide\fR for detailed examples of how to use the \fBdtrace\fR utility to perform these tasks. -.SH OPTIONS -.sp -.LP -The arguments accepted by the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, and \fB-i\fR options can include an optional D language \fIpredicate\fR enclosed in slashes \fB//\fR and optional D language \fIaction\fR statement list enclosed in braces \fB{}\fR. D program code specified on the command line must be appropriately quoted to avoid interpretation of meta-characters by the shell. 
-.sp -.LP +.It +Options that modify DTrace tracing and buffering behavior and enable +additional D compiler features +.El +.Pp +You can use +.Nm +to create D scripts by using it in a shebang declaration to create an +interpreter file. +You can also use +.Nm +to attempt to compile D programs and determine their properties without +actually enabling traces using the +.Fl e +option. +.Sh OPTIONS +The arguments accepted by the +.Fl P , +.Fl m , +.Fl f , +.Fl n , +and +.Fl i +options can include an optional D language +.Ar predicate +enclosed in slashes and an optional D language +.Ar action +statement list enclosed in braces. +D program code specified on the command line must be appropriately quoted to +avoid interpretation of meta-characters by the shell. +.Pp The following options are supported: -.sp -.ne 2 -.mk -.na -\fB\fB-32\fR | \fB-64\fR\fR -.ad -.sp .6 -.RS 4n -The D compiler produces programs using the native data model of the operating system kernel. You can use the \fBisainfo\fR \fB-b\fR command to determine the current operating system data model. If the \fB-32\fR option is specified, \fBdtrace\fR forces -the D compiler to compile a D program using the 32-bit data model. If the \fB-64\fR option is specified, \fBdtrace\fR forces the D compiler to compile a D program using the 64-bit data model. These options are typically not required as \fBdtrace\fR selects the -native data model as the default. The data model affects the sizes of integer types and other language properties. D programs compiled for either data model can be executed on both 32-bit and 64-bit kernels. The \fB-32\fR and \fB-64\fR options also determine the ELF file format -(ELF32 or ELF64) produced by the \fB-G\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-a\fR\fR -.ad -.sp .6 -.RS 4n -Claim anonymous tracing state and display the traced data. You can combine the \fB-a\fR option with the \fB-e\fR option to force \fBdtrace\fR to exit immediately after consuming the anonymous tracing state rather than continuing to wait for new -data. See the \fISolaris Dynamic Tracing Guide\fR for more information about anonymous tracing. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-A\fR\fR -.ad -.sp .6 -.RS 4n -Generate \fBdriver.conf\fR(4) directives for anonymous tracing. This option constructs a set of \fBdtrace\fR(7D) configuration file directives to enable the specified probes for anonymous tracing and then exits. By default, \fBdtrace\fR attempts to store the directives to the file \fB/kernel/drv/dtrace.conf\fR. You can modify this behavior if you use the \fB-o\fR option to specify an alternate output file. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-b\fR \fIbufsz\fR\fR -.ad -.sp .6 -.RS 4n -Set principal trace buffer size (\fIbufsz\fR). The trace buffer size can include any of the size suffixes \fBk\fR, \fBm\fR, \fBg\fR, or \fBt\fR. If the buffer space cannot be allocated, \fBdtrace\fR attempts -to reduce the buffer size or exit depending on the setting of the \fBbufresize\fR property. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-c\fR \fIcmd\fR\fR -.ad -.sp .6 -.RS 4n -Run the specified command \fIcmd\fR and exit upon its completion. If more than one \fB-c\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status for each child process as it -terminates. The process-ID of the first command is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. 
Refer to the \fISolaris Dynamic Tracing Guide\fR for more information -on macro variables. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-C\fR\fR -.ad -.sp .6 -.RS 4n -Run the C preprocessor \fBcpp\fR(1) over D programs before compiling them. You can pass options to the C preprocessor using the \fB-D\fR, \fB-U\fR, \fB-I\fR, and \fB-H\fR options. You can select the degree of C standard conformance if you use the \fB-X\fR option. For a description of the set of tokens defined by the D compiler when invoking the C preprocessor, see \fB-X\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-D\fR \fIname\fR \fB[=\fR\fIvalue\fR\fB]\fR\fR -.ad -.sp .6 -.RS 4n -Define \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). If you specify the equals sign (\fB=\fR) -and additional \fIvalue\fR, the name is assigned the corresponding value. This option passes the \fB-D\fR option to each \fBcpp\fR invocation. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-e\fR\fR -.ad -.sp .6 -.RS 4n -Exit after compiling any requests and consuming anonymous tracing state (\fB-a\fR option) but prior to enabling any probes. You can combine this option with the \fB-a\fR option to print anonymous tracing data and exit. You can also combine this option with D -compiler options. This combination verifies that the programs compile without actually executing them and enabling the corresponding instrumentation. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-f\fR\fB[[\fR\fIprovider\fR\fB:]\fR\fImodule\fR\fB:]\fR\fIfunction\fR\fB[[\fR\fIpredicate\fR\fB]\fR\fIaction\fR\fB]]\fR\fR -.ad -.sp .6 -.RS 4n -Specify function name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function\fR, \fImodule:function\fR, or \fIfunction\fR. -Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. If no qualifiers other than \fIfunction\fR are specified in the description, all probes with the corresponding \fIfunction\fR are matched. -The \fB-f\fR argument can be suffixed with an optional D probe clause. You can specify more than one \fB-f\fR option on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-F\fR\fR -.ad -.sp .6 -.RS 4n -Coalesce trace output by identifying function entry and return. Function entry probe reports are indented and their output is prefixed with \fB->\fR. Function return probe reports are unindented and their output is prefixed with \fB<-\fR\&. System call -entry probe reports are indented and their output is prefixed with \fB=>\fR. System call return probe reports are unindented and their output is prefixed with \fB<=\fR\&. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-G\fR\fR -.ad -.sp .6 -.RS 4n -Generate an ELF file containing an embedded DTrace program. The DTrace probes specified in the program are saved inside of a relocatable ELF object which can be linked into another program. If the \fB-o\fR option is present, the ELF file is saved using the pathname specified -as the argument for this operand. If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fB\fIfilename\fR.d\fR, then the ELF file is saved using the name \fB\fIfilename\fR.o\fR. -Otherwise the ELF file is saved using the name \fBd.out\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-H\fR\fR -.ad -.sp .6 -.RS 4n -Print the pathnames of included files when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). 
This option passes the \fB-H\fR option -to each \fBcpp\fR invocation, causing it to display the list of pathnames, one for each line, to \fBstderr\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-h\fR\fR -.ad -.sp .6 -.RS 4n -Generate a header file containing macros that correspond to probes in the specified provider definitions. This option should be used to generate a header file that is included by other source files for later use with the \fB-G\fR option. If the \fB-o\fR option -is present, the header file is saved using the pathname specified as the argument for that option. If the \fB-o\fR option is not present and the DTrace program is contained with a file whose name is \fIfilename\fR\fB\&.d\fR, then the header file is saved -using the name \fIfilename\fR\fB\&.h\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-i\fR \fIprobe-id\fR\fB[[\fR\fIpredicate\fR] \fIaction\fR\fB]\fR\fR -.ad -.sp .6 -.RS 4n -Specify probe identifier (\fIprobe-id\fR) to trace or list (\fB-l\fR option). You can specify probe IDs using decimal integers as shown by \fBdtrace\fR \fB-l\fR. The \fB-i\fR argument can be suffixed with an optional -D probe clause. You can specify more than one \fB-i\fR option at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-I\fR \fIpath\fR\fR -.ad -.sp .6 -.RS 4n -Add the specified directory \fIpath\fR to the search path for \fB#include\fR files when invoking \fBcpp\fR(1) (enabled -using the \fB-C\fR option). This option passes the \fB-I\fR option to each \fBcpp\fR invocation. The specified \fIpath\fR is inserted into the search path ahead of the default directory list. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-L\fR \fIpath\fR\fR -.ad -.sp .6 -.RS 4n -Add the specified directory \fIpath\fR to the search path for DTrace libraries. DTrace libraries are used to contain common definitions that can be used when writing D programs. The specified \fIpath\fR is added after the default library -search path. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-l\fR\fR -.ad -.sp .6 -.RS 4n -List probes instead of enabling them. If the \fB-l\fR option is specified, \fBdtrace\fR produces a report of the probes matching the descriptions given using the \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, \fB-i\fR, -and \fB-s\fR options. If none of these options are specified, this option lists all probes. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-m\fR [[\fIprovider:\fR] \fImodule:\fR [[\fIpredicate\fR] \fIaction\fR]]\fR -.ad -.sp .6 -.RS 4n -Specify module name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module\fR or \fImodule\fR. Unspecified probe description fields are left blank and match -any probes regardless of the values in those fields. If no qualifiers other than \fImodule\fR are specified in the description, all probes with a corresponding \fImodule\fR are matched. The \fB-m\fR argument can be suffixed with an optional D -probe clause. More than one \fB-m\fR option can be specified on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-n\fR [[[\fIprovider:\fR] \fImodule:\fR] \fIfunction:\fR] \fIname\fR [[\fIpredicate\fR] \fIaction\fR]\fR -.ad -.sp .6 -.RS 4n -Specify probe name to trace or list (\fB-l\fR option). The corresponding argument can include any of the probe description forms \fIprovider:module:function:name\fR, \fImodule:function:name\fR, \fIfunction:name\fR, -or \fIname\fR. Unspecified probe description fields are left blank and match any probes regardless of the values in those fields. 
If no qualifiers other than \fIname\fR are specified in the description, all probes with a corresponding \fIname\fR are -matched. The \fB-n\fR argument can be suffixed with an optional D probe clause. More than one \fB-n\fR option can be specified on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-o\fR \fIoutput\fR\fR -.ad -.sp .6 -.RS 4n -Specify the \fIoutput\fR file for the \fB-A\fR , \fB-G\fR, and \fB-l\fR options, or for the traced data itself. If the \fB-A\fR option is present and \fB-o\fR is not present, the default output file is \fB/kernel/drv/dtrace.conf\fR. If the \fB-G\fR option is present and the \fB-s\fR option's argument is of the form \fB\fIfilename\fR.d\fR and \fB-o\fR is not present, the default output file is \fB\fIfilename\fR.o\fR. -Otherwise the default output file is \fBd.out\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-p\fR \fIpid\fR\fR -.ad -.sp .6 -.RS 4n -Grab the specified process-ID \fIpid\fR, cache its symbol tables, and exit upon its completion. If more than one \fB-p\fR option is present on the command line, \fBdtrace\fR exits when all commands have exited, reporting the exit status -for each process as it terminates. The first process-ID is made available to any D programs specified on the command line or using the \fB-s\fR option through the \fB$target\fR macro variable. Refer to the \fISolaris Dynamic Tracing Guide\fR for -more information on macro variables. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-P\fR \fIprovider\fR \fB[[\fR\fIpredicate\fR\fB]\fR \fIaction\fR]\fR -.ad -.sp .6 -.RS 4n -Specify provider name to trace or list (\fB-l\fR option). The remaining probe description fields module, function, and name are left blank and match any probes regardless of the values in those fields. The \fB-P\fR argument can be suffixed with an optional D -probe clause. You can specify more than one \fB-P\fR option on the command line at a time. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-q\fR\fR -.ad -.sp .6 -.RS 4n -Set quiet mode. \fBdtrace\fR suppresses messages such as the number of probes matched by the specified options and D programs and does not print column headers, the CPU ID, the probe ID, or insert newlines into the output. Only data traced and formatted by D program -statements such as \fBtrace()\fR and \fBprintf()\fR is displayed to \fBstdout\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-s\fR\fR -.ad -.sp .6 -.RS 4n -Compile the specified D program source file. If the \fB-e\fR option is present, the program is compiled but instrumentation is not enabled. If the \fB-l\fR option is present, the program is compiled and the set of probes matched by it is listed, but instrumentation -is not enabled. If none of \fB-e\fR, \fB-l\fR, \fB-G\fR, or \fB-A\fR are present, the instrumentation specified by the D program is enabled and tracing begins. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-S\fR\fR -.ad -.sp .6 -.RS 4n -Show D compiler intermediate code. The D compiler produces a report of the intermediate code generated for each D program to \fBstderr\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-U\fR \fIname\fR\fR -.ad -.sp .6 -.RS 4n -Undefine the specified \fIname\fR when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). This option passes the \fB-U\fR option to each \fBcpp\fR invocation. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-v\fR\fR -.ad -.sp .6 -.RS 4n -Set verbose mode. If the \fB-v\fR option is specified, \fBdtrace\fR produces a program stability report showing the minimum interface stability and dependency level for the specified D programs. 
DTrace stability levels are explained in further detail in the \fISolaris Dynamic Tracing Guide\fR. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-V\fR\fR -.ad -.sp .6 -.RS 4n -Report the highest D programming interface version supported by \fBdtrace\fR. The version information is printed to \fBstdout\fR and the \fBdtrace\fR command exits. Refer to the \fISolaris Dynamic Tracing Guide\fR for -more information about DTrace versioning features. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-w\fR\fR -.ad -.sp .6 -.RS 4n -Permit destructive actions in D programs specified using the \fB-s\fR, \fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options. If the \fB-w\fR option is not specified, \fBdtrace\fR does not -permit the compilation or enabling of a D program that contains destructive actions. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-x\fR \fIarg\fR [\fI=val\fR]\fR -.ad -.sp .6 -.RS 4n -Enable or modify a DTrace runtime option or D compiler option. The list of options is found in the \fISolaris Dynamic Tracing Guide\fR. Boolean options are enabled by specifying their name. Options with values are set by separating the option name and -value with an equals sign (\fB=\fR). -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-X\fR \fBa | c | s | t\fR\fR -.ad -.sp .6 -.RS 4n -Specify the degree of conformance to the ISO C standard that should be selected when invoking \fBcpp\fR(1) (enabled using the \fB-C\fR option). -The \fB-X\fR option argument affects the value and presence of the \fB__STDC__\fR macro depending upon the value of the argument letter. -.sp -The \fB-X\fR option supports the following arguments: -.sp -.ne 2 -.mk -.na -\fB\fBa\fR\fR -.ad -.RS 5n -.rt -Default. ISO C plus K&R compatibility extensions, with semantic changes required by ISO C. This is the default mode if \fB-X\fR is not specified. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction -with the \fB-Xa\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fBc\fR\fR -.ad -.RS 5n -.rt -Conformance. Strictly conformant ISO C, without K&R C compatibility extensions. The predefined macro \fB__STDC__\fR has a value of 1 when \fBcpp\fR is invoked in conjunction with the \fB-Xc\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fBs\fR\fR -.ad -.RS 5n -.rt -K&R C only. The macro \fB__STDC__\fR is not defined when \fBcpp\fR is invoked in conjunction with the \fB-Xs\fR option. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fBt\fR\fR -.ad -.RS 5n -.rt -Transition. ISO C plus K&R C compatibility extensions, without semantic changes required by ISO C. The predefined macro \fB__STDC__\fR has a value of 0 when \fBcpp\fR is invoked in conjunction with the \fB-Xt\fR option. -.RE - -As the \fB-X\fR option only affects how the D compiler invokes the C preprocessor, the \fB-Xa\fR and \fB-Xt\fR options are equivalent from the perspective of D and both are provided only to ease re-use of settings from a C build environment. 
-.sp -Regardless of the \fB-X\fR mode, the following additional C preprocessor definitions are always specified and valid in all modes: -.RS +4 -.TP -.ie t \(bu -.el o -\fB__sun\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__unix\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__SVR4\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__sparc\fR (on SPARC systems only) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__sparcv9\fR (on SPARC systems only when 64-bit programs are compiled) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__i386\fR (on x86 systems only when 32-bit programs are compiled) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__amd64\fR (on x86 systems only when 64-bit programs are compiled) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__\fI`uname -s`\fR_\fI`uname -r`\fR\fR (for example, \fB__SunOS_5_10\fR) -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__SUNW_D=1\fR -.RE -.RS +4 -.TP -.ie t \(bu -.el o -\fB__SUNW_D_VERSION=0x\fIMMmmmuuu\fR\fR -.sp -Where \fIMM\fR is the major release value in hexadecimal, \fImmm\fR is the minor release value in hexadecimal, and \fIuuu\fR is the -micro release value in hexadecimal. Refer to the \fISolaris Dynamic Tracing Guide\fR for more information about DTrace versioning. -.RE -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB-Z\fR\fR -.ad -.sp .6 -.RS 4n -Permit probe descriptions that match zero probes. If the \fB-Z\fR option is not specified, \fBdtrace\fR reports an error and exits if any probe descriptions specified in D program files (\fB-s\fR option) or on the command line (\fB-P\fR, \fB-m\fR, \fB-f\fR, \fB-n\fR, or \fB-i\fR options) contain descriptions that do not match any known probes. -.RE - -.SH OPERANDS -.sp -.LP -You can specify zero or more additional arguments on the \fBdtrace\fR command line to define a set of macro variables (\fB$1\fR, \fB$2\fR, and so forth). The additional arguments can be used in D programs specified using the \fB-s\fR option -or on the command line. The use of macro variables is described further in the \fISolaris Dynamic Tracing Guide\fR. -.SH EXIT STATUS -.sp -.LP -The following exit values are returned: -.sp -.ne 2 -.mk -.na -\fB0\fR -.ad -.RS 5n -.rt -Successful completion. -.sp -For D program requests, an exit status of \fB0\fR indicates that programs were successfully compiled, probes were successfully enabled, or anonymous state was successfully retrieved. \fBdtrace\fR returns \fB0\fR even if the specified tracing requests -encountered errors or drops. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB1\fR\fR -.ad -.RS 5n -.rt +.Bl -tag -width indent +.It Fl 32 | Fl 64 +The D compiler produces programs using the native data model of the operating +system kernel. +If the +.Fl 32 +option is specified, +.Nm +forces the D compiler to compile a D program using the 32-bit data model. +If the +.Fl 64 +option is specified, +.Nm +forces the D compiler to compile a D program using the 64-bit data model. +These options are typically not required as +.Nm +selects the native data model as the default. +The data model affects the sizes of integer types and other language properties. +D programs compiled for either data model can be executed on both 32-bit and +64-bit kernels. +The +.Fl 32 +and +.Fl 64 +options also determine the +.Xr elf 5 +file format (ELF32 or ELF64) produced by the +.Fl G +option. +.It Fl a +Claim anonymous tracing state and display the traced data. +You can combine the +.Fl a +option with the +.Fl e +option to force +.Nm +to exit immediately after consuming the anonymous tracing state rather than +continuing to wait for new data. 
+.It Fl A +Generate directives for anonymous tracing and write them to +.Pa /boot/dtrace.dof . +This option constructs a set of dtrace configuration file directives to enable +the specified probes for anonymous tracing and then exits. +By default, +.Nm +attempts to store the directives to the file +.Pa /boot/dtrace.dof . +This behavior can be modified using the +.Fl o +option to specify an alternate output file. +.It Fl b Ar bufsz +Set the principal trace buffer size to +.Ar bufsz . +The trace buffer size can include any of the size suffixes k, m, g, or t. +If the buffer space cannot be allocated, +.Nm dtrace +attempts to reduce the buffer size or exit depending on the setting of the +bufresize property. +.It Fl c Ar cmd +Run the specified command +.Ar cmd +and exit upon its completion. +If more than one +.Fl c +option is present on the command line, +.Nm dtrace +exits when all commands have exited, reporting the exit status for each child +process as it terminates. +The process ID of the first command is made available to any D programs +specified on the command line or using the +.Fl s +option through the +.Li $target +macro variable. +.It Fl C +Run the C preprocessor +.Xr cpp 1 +over D programs before compiling them. +You can pass options to the C preprocessor using the +.Fl D , +.Fl U , +.Fl I , +and +.Fl H +options. +You can select the degree of C standard conformance if you use the +.Fl X +option. +For a description of the set of tokens defined by the D compiler when invoking +the C preprocessor, see +.Fl X . +.It Fl D Ar name Op Ns = Ns value +Define +.Ar name +when invoking +.Xr cpp 1 +(enabled using the +.Fl C +option). +If you specify an additional +.Ar value , +the name is assigned the corresponding value. +This option passes the +.Fl D +option to each +.Xr cpp 1 +invocation. +.It Fl e +Exit after compiling any requests and consuming anonymous tracing state +.Fl ( a +option) but prior to enabling any probes. +You can combine this option with the +.Fl a +option to print anonymous tracing data and exit. +You can also combine this option with D compiler options. +This combination verifies that the programs compile without actually executing +them and enabling the corresponding instrumentation. +.It Fl f Oo Oo Ar provider : Oc Ar module : Oc Ar function Oo Oo Ar predicate \ + Oc Ar action Oc +Specify function name to trace or list +.Fl ( l +option). +The corresponding argument can include any of the probe description forms +.Ar provider:module:function , +.Ar module:function , +or +.Ar function . +Unspecified probe description fields are left blank and match any probes +regardless of the values in those fields. +If no qualifiers other than +.Ar function +are specified in the description, all probes with the corresponding +.Ar function +are matched. +The +.Fl f +argument can be suffixed with an optional D probe clause. +You can specify more than one +.Fl f +option on the command line at a time. +.It Fl F +Coalesce trace output by identifying function entry and return. +Function entry probe reports are indented and their output is prefixed with +.Ql -> . +Function return probe reports are unindented and their output is prefixed with +.Ql <- . +System call entry probe reports are indented and their output is prefixed with +.Ql => . +System call return probe reports are unindented and their output is prefixed +with +.Ql <= . +.It Fl G +Generate an ELF file containing an embedded DTrace program. 
+The DTrace probes specified in the program are saved inside of a relocatable ELF +object which can be linked into another program. +If the +.Fl o +option is present, the ELF file is saved using the pathname specified as the +argument for this operand. +If the +.Fl o +option is not present and the DTrace program is contained with a file whose name +is +.Ar filename.d , +then the ELF file is saved using the name +.Ar filename.o . +Otherwise the ELF file is saved using the name d.out. +.It Fl h +Generate a header file containing macros that correspond to probes in the +specified provider definitions. +This option should be used to generate a header file that is included by other +source files for later use with the +.Fl G +option. +If the +.Fl o +option is present, the header file is saved using the pathname specified as the +argument for that option. +If the +.Fl o +option is not present and the DTrace program is contained within a file whose +name is +.Ar filename.d , +then the header file is saved using the name +.Ar filename.h . +.It Fl H +Print the pathnames of included files when invoking +.Xr cpp 1 +(enabled using the +.Fl C +option). +This option passes the +.Fl H +option to each +.Xr cpp 1 +invocation, causing it to display the list of pathnames, one for each line, to +standard error. +.It Fl i Ar probe-id Op Oo Ar predicate Oc Ar action +Specify probe identifier +.Ar ( probe-id ) +to trace or list +.Ar ( l +option). +You can specify probe IDs using decimal integers as shown by `dtrace -l`. +The +.Fl i +argument can be suffixed with an optional D probe clause. +You can specify more than one +.Fl i +option at a time. +.It Fl I Ar path +Add the specified directory +.Ar path +to the search path for #include files when invoking +.Xr cpp 1 +(enabled using the +.Fl C +option). +This option passes the +.Fl I +option to each +.Xr cpp 1 +invocation. +The specified +.Ar path +is inserted into the search path ahead of the default directory list. +.It Fl l +List probes instead of enabling them. +If the +.Fl l +option is specified, +.Nm +produces a report of the probes matching the descriptions given using the +.Fl P , m , f , n , i , +and +.Fl s +options. +If none of these options are specified, this option lists all probes. +.It Fl L Ar path +Add the specified directory +.Ar path +to the search path for DTrace libraries. +DTrace libraries are used to contain common definitions that can be used when +writing D programs. +The specified +.Ar path +is added after the default library search path. +.It Fl m Oo Ar provider : Oc Ar module Oo Oo Ar predicate Oc Ar action Oc +Specify module name to trace or list +.Fl ( l +option). +The corresponding argument can include any of the probe description forms +.Ar provider:module +or +.Ar module . +Unspecified probe description fields are left blank and match any probes +regardless of the values in those fields. +If no qualifiers other than +.Ar module +are specified in the description, all probes with a corresponding +.Ar module +are matched. +The +.Fl m +argument can be suffixed with an optional D probe clause. +More than one +.Fl m +option can be specified on the command line at a time. +.It Fl n Oo Oo Oo Ar provider : Oc Ar module : Oc Ar function : Oc Ar name \ + Oo Oo Ar predicate Oc Ar action Oc +Specify probe name to trace or list +.Fl ( l +option). +The corresponding argument can include any of the probe description forms +.Ar provider:module:function:name , module:function:name , function:name , +or +.Ar name . 
+Unspecified probe description fields are left blank and match any probes +regardless of the values in those fields. +If no qualifiers other than +.Ar name +are specified in the description, all probes with a corresponding +.Ar name +are matched. +The +.Fl n +argument can be suffixed with an optional D probe clause. +More than one +.Fl n +option can be specified on the command line at a time. +.It Fl o Ar output +Specify the +.Ar output +file for the +.Fl A , G , +and +.Fl l +options, or for the traced data itself. +If the +.Fl A +option is present and +.Fl o +is not present, the default output file is +.Pa /boot/dtrace.dof . +If the +.Fl G +option is present and the +.Fl s +option's argument is of the form +.Ar filename.d +and +.Fl o +is not present, the default output file is +.Ar filename.o . +Otherwise the default output file is +.Ar d.out . +.It Fl p Ar pid +Grab the specified process-ID +.Ar pid , +cache its symbol tables, and exit upon its completion. +If more than one +.Fl p +option is present on the command line, +.Nm +exits when all commands have exited, reporting the exit status for each process +as it terminates. +The first process-ID is made available to any D programs specified on the +command line or using the +.Fl s +option through the +.Li $target +macro variable. +.It Fl P Ar provider Oo Oo Ar predicate Oc Ar action Oc +Specify provider name to trace or list +.Fl ( l +option). +The remaining probe description fields module, function, and name are left +blank and match any probes regardless of the values in those fields. +The +.Fl P +argument can be suffixed with an optional D probe clause. +You can specify more than one +.Fl P +option on the command line at a time. +.It Fl q +Set quiet mode. +.Nm +suppresses messages such as the number of probes matched by the specified +options and D programs and does not print column headers, the CPU ID, the probe +ID, or insert newlines into the output. +Only data traced and formatted by D program statements such as +.Ql dtrace() +and +.Ql printf() +is displayed to standard output. +.It Fl s Ar script +Compile the specified D program source file. +If the +.Fl e +option is present, the program is compiled but instrumentation is not enabled. +If the +.Fl l +option is present, the program is compiled and the set of probes matched by it +is listed, but instrumentation is not enabled. +If none of +.Fl e , l , G , +or +.Fl A +are present, the instrumentation specified by the D program is enabled and +tracing begins. +.It Fl S +Show D compiler intermediate code. +The D compiler produces a report of the intermediate code generated for each D +program to standard error. +.It Fl U Ar name +Undefine the specified +.Ar name +when invoking +.Xr cpp 1 +(enabled using the +.Fl C +option). +This option passes the +.Fl U +option to each +.Xr cpp 1 +invocation. +.It Fl v +Set verbose mode. +If the +.Fl v +option is specified, +.Nm +produces a program stability report showing the minimum interface stability and +dependency level for the specified D programs. +.It Fl V +Report the highest D programming interface version supported by +.Nm . +The version information is printed to standard output and the +.Nm +command exits. +.It Fl w +Permit destructive actions in D programs specified using the +.Fl s , P , m , f , n , +or +.Fl i +options. +If the +.Fl w +option is not specified, +.Nm +does not permit the compilation or enabling of a D program that contains +destructive actions. 
+.It Fl x Ar arg Op Ns = Ns value +Enable or modify a DTrace runtime option or D compiler option. +Boolean options are enabled by specifying their name. +Options with values are set by separating the option name and value with an +equals sign (=). +.It Fl X Cm a | c | s | t +Specify the degree of conformance to the ISO C standard that should be selected +when invoking +.Xr cpp 1 +(enabled using the +.Fl C +option). +The +.Fl X +option argument affects the value and presence of the __STDC__ macro depending +upon the value of the argument letter. +.sp +The +.Fl X +option supports the following arguments: +.Bl -tag -width indent +.It a +Default. +ISO C plus K&R compatibility extensions, with semantic changes required by ISO +C. +This is the default mode if +.Fl X +is not specified. +The predefined macro __STDC__ has a value of 0 when +.Xr cpp 1 +is invoked in conjunction with the +.Fl Xa +option. +.It c +Conformance. +Strictly conformant ISO C, without K&R C compatibility extensions. +The predefined macro __STDC__ has a value of 1 when +.Xr cpp 1 +is invoked in conjunction with the +.Fl \&Xc +option. +.It s +K&R C only. +The macro __STDC__ is not defined when +.Xr cpp 1 +is invoked in conjunction with the +.Fl Xs +option. +.It t +Transition. +ISO C plus K&R C compatibility extensions, without semantic changes required by +ISO C. +The predefined macro __STDC__ has a value of 0 when +.Xr cpp 1 +is invoked in conjunction with the +.Fl Xt +option. +.El +.Pp +As the +.Fl X +option only affects how the D compiler invokes the C preprocessor, the +.Fl Xa +and +.Fl Xt +options are equivalent from the perspective of D and both are provided only to +ease re-use of settings from a C build environment. +.Pp +Regardless of the +.Fl X +mode, the following additional C preprocessor definitions are always specified +and valid in all modes: +.Bl -bullet -offset indent +.It +__sun +.It +__unix +.It +__SVR4 +.It +__sparc (on SPARC systems only) +.It +__sparcv9 (on SPARC systems only when 64-bit programs are compiled) +.It +__i386 (on x86 systems only when 32-bit programs are compiled) +.It +__amd64 (on x86 systems only when 64-bit programs are compiled) +.It +__`uname -s`_`uname -r` (for example, +.Ql FreeBSD_9.2-RELEASE . +.It +__SUNW_D=1 +.It +.No __SUNW_D_VERSION=0x Ns Ar MMmmmuuu +.Pp +Where +.Ar MM +is the major release value in hexadecimal, +.Ar mmm +is the minor release value in hexadecimal, and +.Ar uuu +is the micro release value in hexadecimal. +.El +.It Fl Z +Permit probe descriptions that match zero probes. +If the +.Fl Z +option is not specified, +.Nm +reports an error and exits if any probe descriptions specified in D program +files +.Fl ( s +option) or on the command line +.Fl ( P , m , f , n , +or +.Fl i +options) contain descriptions that do not match any known probes. +.El +.Sh OPERANDS +You can specify zero or more additional arguments on the +.Nm +command line to define a set of macro variables and so forth). +The additional arguments can be used in D programs specified using the +.Fl s +option or on the command line. +.Sh FILES +.Bl -tag -width /boot/dtrace.dof -compact +.It Pa /boot/dtrace.dof +File for anonymous tracing directives. +.El +.Sh EXIT STATUS +The following exit statuses are returned: +.Bl -tag -width indent +.It 0 +Successful completion. +.Pp +For D program requests, an exit status of 0 indicates that programs were +successfully compiled, probes were successfully enabled, or anonymous state +was successfully retrieved. 
+.Nm +returns 0 even if the specified tracing requests encountered errors or drops. +.It 1 An error occurred. -.sp -For D program requests, an exit status of \fB1\fR indicates that program compilation failed or that the specified request could not be satisfied. -.RE - -.sp -.ne 2 -.mk -.na -\fB\fB2\fR\fR -.ad -.RS 5n -.rt +.Pp +For D program requests, an exit status of 1 indicates that program compilation +failed or that the specified request could not be satisfied. +.It 2 Invalid command line options or arguments were specified. -.RE - -.SH ATTRIBUTES -.sp -.LP -See \fBattributes\fR(5) for descriptions of the following attributes: -.sp - -.sp -.TS -tab() box; -cw(2.75i) |cw(2.75i) -lw(2.75i) |lw(2.75i) -. -ATTRIBUTE TYPEATTRIBUTE VALUE -_ -AvailabilitySUNWdtrc -_ -Interface StabilitySee below. -.TE - -.sp -.LP -The command-line syntax is Committed. The human-readable output is Uncommitted. -.SH SEE ALSO -.sp -.LP -\fBcpp\fR(1), \fBisainfo\fR(1), \fBlibdtrace\fR(3LIB), \fBdriver.conf\fR(4), \fBattributes\fR(5), \fBdtrace\fR(7D) -.sp -.LP -\fISolaris Dynamic Tracing Guide\fR +.El +.Sh SEE ALSO +.Xr cpp 1 , +.Xr dtruss 1 , +.Xr elf 5 , +.Xr SDT 9 +.Rs +.%T Solaris Dynamic Tracing Guide +.Re Index: src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c,v retrieving revision 1.10 diff -u -p -r1.10 dtrace.c --- src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c 5 Jun 2017 21:19:32 -0000 1.10 +++ src/external/cddl/osnet/dist/cmd/dtrace/dtrace.c 7 Jun 2017 18:38:01 -0000 @@ -23,8 +23,10 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ #include #include @@ -41,13 +43,16 @@ #include #include #include -#if defined(sun) +#ifdef illumos #include #endif #include -#if defined(sun) +#ifdef illumos #include #endif +#ifdef __FreeBSD__ +#include +#endif typedef struct dtrace_cmd { void (*dc_func)(struct dtrace_cmd *); /* function to compile arg */ @@ -88,6 +93,9 @@ static int g_flowindent; static int g_intr; static int g_impatient; static int g_newline; +#if defined(__FreeBSD__) || defined(__NetBSD__) +static int g_siginfo; +#endif static int g_total; static int g_cflags; static int g_oflags; @@ -99,7 +107,7 @@ static int g_grabanon = 0; static const char *g_ofile = NULL; static FILE *g_ofp; static dtrace_hdl_t *g_dtp; -#if defined(sun) +#ifdef illumos static char *g_etcfile = "/etc/system"; static const char *g_etcbegin = "* vvvv Added by DTrace"; static const char *g_etcend = "* ^^^^ Added by DTrace"; @@ -195,6 +203,13 @@ fatal(const char *fmt, ...) verror(fmt, ap); va_end(ap); + /* + * Close the DTrace handle to ensure that any controlled processes are + * correctly restored and continued. + */ + if (g_dtp) + dtrace_close(g_dtp); + exit(E_ERROR); } @@ -202,7 +217,7 @@ fatal(const char *fmt, ...) static void __printflike(1, 2) __dead dfatal(const char *fmt, ...) { -#if !defined(sun) && defined(NEED_ERRLOC) +#if !defined(illumos) && defined(NEED_ERRLOC) char *p_errfile = NULL; int errline = 0; #endif @@ -223,7 +238,7 @@ dfatal(const char *fmt, ...) 
(void) fprintf(stderr, "%s\n", dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); } -#if !defined(sun) && defined(NEED_ERRLOC) +#if !defined(illumos) && defined(NEED_ERRLOC) dt_get_errloc(g_dtp, &p_errfile, &errline); if (p_errfile != NULL) printf("File '%s', line %d\n", p_errfile, errline); @@ -388,7 +403,42 @@ dof_prune(const char *fname) free(buf); } -#if defined(sun) +#ifdef __FreeBSD__ +/* + * Use nextboot(8) to tell the loader to load DTrace kernel modules during + * the next boot of the system. The nextboot(8) configuration is removed during + * boot, so it will not persist indefinitely. + */ +static void +bootdof_add(void) +{ + char * const nbargv[] = { + "nextboot", "-a", + "-e", "dtraceall_load=\"YES\"", + "-e", "dtrace_dof_load=\"YES\"", + "-e", "dtrace_dof_name=\"/boot/dtrace.dof\"", + "-e", "dtrace_dof_type=\"dtrace_dof\"", + NULL, + }; + pid_t child; + int err, status; + + err = posix_spawnp(&child, "nextboot", NULL, NULL, nbargv, + NULL); + if (err != 0) { + error("failed to execute nextboot: %s", strerror(err)); + exit(E_ERROR); + } + + if (waitpid(child, &status, 0) != child) + fatal("waiting for nextboot"); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + error("nextboot returned with status %d", status); + exit(E_ERROR); + } +} +#endif +#ifdef illumos static void etcsystem_prune(void) { @@ -499,12 +549,13 @@ etcsystem_add(void) error("added forceload directives to %s\n", g_ofile); } -#endif +#endif /* illumos__ */ static void print_probe_info(const dtrace_probeinfo_t *p) { char buf[BUFSIZ]; + char *user; int i; oprintf("\n\tProbe Description Attributes\n"); @@ -528,10 +579,14 @@ print_probe_info(const dtrace_probeinfo_ oprintf("\n\tArgument Types\n"); for (i = 0; i < p->dtp_argc; i++) { + if (p->dtp_argv[i].dtt_flags & DTT_FL_USER) + user = "userland "; + else + user = ""; if (ctf_type_name(p->dtp_argv[i].dtt_ctfp, p->dtp_argv[i].dtt_type, buf, sizeof (buf)) == NULL) (void) strlcpy(buf, "(unknown)", sizeof (buf)); - oprintf("\t\targs[%d]: %s\n", i, buf); + oprintf("\t\targs[%d]: %s%s\n", i, user, buf); } if (p->dtp_argc == 0) @@ -629,25 +684,26 @@ anon_prog(const dtrace_cmd_t *dcp, dof_h p = (uchar_t *)dof; q = p + dof->dofh_loadsz; -#if defined(sun) - oprintf("dof-data-%d=0x%x", n, *p++); - - while (p < q) - oprintf(",0x%x", *p++); - - oprintf(";\n"); -#else +#ifdef __FreeBSD__ /* - * On FreeBSD, the DOF data is handled as a kernel environment (kenv) - * string. We use two hex characters per DOF byte. + * On FreeBSD, the DOF file is read directly during boot - just write + * two hex characters per byte. */ - oprintf("dof-data-%d=%02x", n, *p++); + oprintf("dof-data-%d=", n); while (p < q) oprintf("%02x", *p++); oprintf("\n"); #endif +#ifdef illumos + oprintf("dof-data-%d=0x%x", n, *p++); + + while (p < q) + oprintf(",0x%x", *p++); + + oprintf(";\n"); +#endif dtrace_dof_destroy(g_dtp, dof); } @@ -671,9 +727,12 @@ link_prog(dtrace_cmd_t *dcp) p[0] = '\0'; /* strip .d suffix */ (void) snprintf(dcp->dc_ofile, sizeof (dcp->dc_ofile), "%s.o", basename(dcp->dc_arg)); + } else if (g_cmdc > 1) { + (void) snprintf(dcp->dc_ofile, sizeof (dcp->dc_ofile), + "d.out.%td", dcp - g_cmdv); } else { (void) snprintf(dcp->dc_ofile, sizeof (dcp->dc_ofile), - g_cmdc > 1 ? 
"%s.%d" : "%s", "d.out", (int)(dcp - g_cmdv)); + "d.out"); } if (dtrace_program_link(g_dtp, dcp->dc_prog, DTRACE_D_PROBES, @@ -693,6 +752,9 @@ list_probe(dtrace_hdl_t *dtp, const dtra if (g_verbose && dtrace_probe_info(dtp, pdp, &p) == 0) print_probe_info(&p); + if (g_intr != 0) + return (1); + return (0); } @@ -868,16 +930,16 @@ setopthandler(const dtrace_setoptdata_t #define BUFDUMPSTR(ptr, field) \ (void) printf("%s: %20s => ", g_pname, #field); \ if ((ptr)->field != NULL) { \ - const char *xc = (ptr)->field; \ + const char *c = (ptr)->field; \ (void) printf("\""); \ do { \ - if (*xc == '\n') { \ + if (*c == '\n') { \ (void) printf("\\n"); \ continue; \ } \ \ - (void) printf("%c", *xc); \ - } while (*xc++ != '\0'); \ + (void) printf("%c", *c); \ + } while (*c++ != '\0'); \ (void) printf("\"\n"); \ } else { \ (void) printf("\n"); \ @@ -914,7 +976,7 @@ bufhandler(const dtrace_bufdata_t *bufda { "AGGFORMAT", DTRACE_BUFDATA_AGGFORMAT }, { "AGGLAST", DTRACE_BUFDATA_AGGLAST }, { "???", UINT32_MAX }, - { NULL, 0 } + { NULL } }; if (bufdata->dtbda_probe != NULL) { @@ -971,7 +1033,7 @@ bufhandler(const dtrace_bufdata_t *bufda uint8_t *data; int lim = rec->dtrd_size; - (void) snprintf(buf, end - buf, "%d (data: ", rec->dtrd_offset); + (void) sprintf(buf, "%d (data: ", rec->dtrd_offset); c = buf + strlen(buf); if (lim > sizeof (uint64_t)) @@ -1070,7 +1132,7 @@ chew(const dtrace_probedata_t *data, voi (void) snprintf(name, sizeof (name), "%s:%s", pd->dtpd_func, pd->dtpd_name); - oprintf("%3d %6d %32s ", (int)cpu, pd->dtpd_id, name); + oprintf("%3d %6d %32s ", cpu, pd->dtpd_id, name); } } else { int indent = data->dtpda_indent; @@ -1090,7 +1152,7 @@ chew(const dtrace_probedata_t *data, voi data->dtpda_prefix, pd->dtpd_func); } - oprintf("%3d %-41s ", (int)cpu, name); + oprintf("%3d %-41s ", cpu, name); } return (DTRACE_CONSUME_THIS); @@ -1102,19 +1164,19 @@ go(void) int i; struct { - const char *name; - const char *optname; + char *name; + char *optname; dtrace_optval_t val; } bufs[] = { - { "buffer size", "bufsize", 0 }, - { "aggregation size", "aggsize", 0 }, - { "speculation size", "specsize", 0 }, - { "dynamic variable size", "dynvarsize", 0 }, - { NULL, NULL, 0 } + { "buffer size", "bufsize" }, + { "aggregation size", "aggsize" }, + { "speculation size", "specsize" }, + { "dynamic variable size", "dynvarsize" }, + { NULL } }, rates[] = { - { "cleaning rate", "cleanrate", 0 }, - { "status rate", "statusrate", 0 }, - { NULL, NULL ,0 } + { "cleaning rate", "cleanrate" }, + { "status rate", "statusrate" }, + { NULL } }; for (i = 0; bufs[i].name != NULL; i++) { @@ -1159,7 +1221,7 @@ go(void) for (i = 0; rates[i].name != NULL; i++) { dtrace_optval_t nval; - const char *dir; + char *dir; if (rates[i].val == DTRACEOPT_UNSET) continue; @@ -1203,11 +1265,48 @@ intr(int signo) g_impatient = 1; } +#ifdef __FreeBSD__ +static void +siginfo(int signo __unused) +{ + + g_siginfo++; + g_newline = 1; +} +#endif + +static void +installsighands(void) +{ + struct sigaction act, oact; + + (void) sigemptyset(&act.sa_mask); + act.sa_flags = 0; + act.sa_handler = intr; + + if (sigaction(SIGINT, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) + (void) sigaction(SIGINT, &act, NULL); + + if (sigaction(SIGTERM, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) + (void) sigaction(SIGTERM, &act, NULL); + +#ifdef __FreeBSD__ + if (sigaction(SIGPIPE, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) + (void) sigaction(SIGPIPE, &act, NULL); + + if (sigaction(SIGUSR1, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) + (void) 
sigaction(SIGUSR1, &act, NULL); + + act.sa_handler = siginfo; + if (sigaction(SIGINFO, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) + (void) sigaction(SIGINFO, &act, NULL); +#endif +} + int main(int argc, char *argv[]) { dtrace_bufdesc_t buf; - struct sigaction act, oact; dtrace_status_t status[2]; dtrace_optval_t opt; dtrace_cmd_t *dcp; @@ -1399,6 +1498,7 @@ main(int argc, char *argv[]) (void) dtrace_setopt(g_dtp, "bufsize", "4m"); (void) dtrace_setopt(g_dtp, "aggsize", "4m"); #endif + (void) dtrace_setopt(g_dtp, "temporal", "yes"); /* * If -G is specified, enable -xlink=dynamic and -xunodefs to permit @@ -1676,19 +1776,19 @@ main(int argc, char *argv[]) case DMODE_ANON: if (g_ofile == NULL) -#if defined(sun) +#ifdef illumos g_ofile = "/kernel/drv/dtrace.conf"; -#else +#endif +#ifdef __FreeBSD__ /* * On FreeBSD, anonymous DOF data is written to - * the DTrace DOF file that the boot loader will - * read if booting with the DTrace option. + * the DTrace DOF file. */ g_ofile = "/boot/dtrace.dof"; #endif dof_prune(g_ofile); /* strip out any old DOF directives */ -#if defined(sun) +#ifdef illumos etcsystem_prune(); /* string out any forceload directives */ #endif @@ -1721,7 +1821,11 @@ main(int argc, char *argv[]) * that itself contains a #pragma D option quiet. */ error("saved anonymous enabling in %s\n", g_ofile); -#if defined(sun) + +#ifdef __FreeBSD__ + bootdof_add(); +#endif +#ifdef illumos etcsystem_add(); error("run update_drv(1M) or reboot to enable changes\n"); #endif @@ -1758,6 +1862,8 @@ main(int argc, char *argv[]) if (g_ofile != NULL && (g_ofp = fopen(g_ofile, "a")) == NULL) fatal("failed to open output file '%s'", g_ofile); + installsighands(); + oprintf("%5s %10s %17s %33s %s\n", "ID", "PROVIDER", "MODULE", "FUNCTION", "NAME"); @@ -1779,7 +1885,7 @@ main(int argc, char *argv[]) } if (g_ofile == NULL) { - char *pv; + char *p; if (g_cmdc > 1) { (void) fprintf(stderr, "%s: -h requires an " @@ -1789,8 +1895,8 @@ main(int argc, char *argv[]) return (E_USAGE); } - if ((pv = strrchr(g_cmdv[0].dc_arg, '.')) == NULL || - strcmp(pv, ".d") != 0) { + if ((p = strrchr(g_cmdv[0].dc_arg, '.')) == NULL || + strcmp(p, ".d") != 0) { (void) fprintf(stderr, "%s: -h requires an " "output file if no scripts are " "specified\n", g_pname); @@ -1798,9 +1904,9 @@ main(int argc, char *argv[]) return (E_USAGE); } - pv[0] = '\0'; /* strip .d suffix */ - g_ofile = pv = g_cmdv[0].dc_ofile; - (void) snprintf(pv, sizeof (g_cmdv[0].dc_ofile), + p[0] = '\0'; /* strip .d suffix */ + g_ofile = p = g_cmdv[0].dc_ofile; + (void) snprintf(p, sizeof (g_cmdv[0].dc_ofile), "%s.h", basename(g_cmdv[0].dc_arg)); } @@ -1843,20 +1949,7 @@ main(int argc, char *argv[]) if (opt != DTRACEOPT_UNSET) notice("allowing destructive actions\n"); - (void) sigemptyset(&act.sa_mask); - act.sa_flags = 0; - act.sa_handler = intr; - - if (sigaction(SIGINT, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) - (void) sigaction(SIGINT, &act, NULL); - - if (sigaction(SIGTERM, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) - (void) sigaction(SIGTERM, &act, NULL); - -#if !defined(sun) - if (sigaction(SIGUSR1, NULL, &oact) == 0 && oact.sa_handler != SIG_IGN) - (void) sigaction(SIGUSR1, &act, NULL); -#endif + installsighands(); /* * Now that tracing is active and we are ready to consume trace data, @@ -1872,6 +1965,13 @@ main(int argc, char *argv[]) if (!g_intr && !done) dtrace_sleep(g_dtp); +#if defined(__FreeBSD__) || defined(__NetBSD__) + if (g_siginfo) { + (void)dtrace_aggregate_print(g_dtp, g_ofp, NULL); + g_siginfo = 0; + } +#endif + if 
(g_newline) { /* * Output a newline just to make the output look Index: src/external/cddl/osnet/dist/cmd/zdb/zdb.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zdb/zdb.c,v retrieving revision 1.6 diff -u -p -r1.6 zdb.c --- src/external/cddl/osnet/dist/cmd/zdb/zdb.c 28 Mar 2014 03:18:24 -0000 1.6 +++ src/external/cddl/osnet/dist/cmd/zdb/zdb.c 27 Mar 2017 06:26:21 -0000 @@ -18,12 +18,15 @@ * * CDDL HEADER END */ + /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include +#include #include #include #include @@ -34,6 +37,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -52,22 +58,30 @@ #include #include #include -#undef ZFS_MAXNAMELEN +#include +#include #undef verify #include -#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ - zio_compress_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ - zio_checksum_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ - dmu_ot[(idx)].ot_name : "UNKNOWN") -#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES) +#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ + zio_compress_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ + zio_checksum_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \ + dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ? \ + dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN") +#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ + (((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ? \ + DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES)) #ifndef lint -extern int zfs_recover; +extern boolean_t zfs_recover; +extern uint64_t zfs_arc_max, zfs_arc_meta_limit; +extern int zfs_vdev_async_read_max_active; #else -int zfs_recover; +boolean_t zfs_recover; +uint64_t zfs_arc_max, zfs_arc_meta_limit; +int zfs_vdev_async_read_max_active; #endif const char cmdname[] = "zdb"; @@ -76,9 +90,12 @@ uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); extern void dump_intent_log(zilog_t *); -uint64_t *zopt_object = NULL; -int zopt_objects = 0; -libzfs_handle_t *g_zfs; +static uint64_t *zopt_object = NULL; +static int zopt_objects = 0; +static libzfs_handle_t *g_zfs; +static uint64_t max_inflight = 1000; + +static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *); /* * These libumem hooks provide a reasonable set of defaults for the allocator's @@ -100,13 +117,17 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CumdibcsvhL] poolname [object...]\n" - " %s [-div] dataset [object...]\n" - " %s -m [-L] poolname [vdev [metaslab...]]\n" - " %s -R poolname vdev:offset:size[:flags]\n" - " %s -S poolname\n" - " %s -l [-u] device\n" - " %s -C\n\n", + "Usage: %s [-CumMdibcsDvhLXFPAG] [-t txg] [-e [-p path...]] " + "[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n" + " %s [-divPA] [-e -p path...] 
[-U config] dataset " + "[object...]\n" + " %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " + "poolname [vdev [metaslab...]]\n" + " %s -R [-A] [-e [-p path...]] poolname " + "vdev:offset:size[:flags]\n" + " %s -S [-PA] [-e [-p path...]] [-U config] poolname\n" + " %s -l [-uA] device\n" + " %s -C [-A] [-U config]\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " @@ -123,9 +144,11 @@ usage(void) (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); + (void) fprintf(stderr, " -D dedup statistics\n"); (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v verbose (applies to all others)\n"); (void) fprintf(stderr, " -l dump label contents\n"); @@ -134,7 +157,7 @@ usage(void) (void) fprintf(stderr, " -R read and display block from a " "device\n\n"); (void) fprintf(stderr, " Below options are intended for use " - "with other options (except -l):\n"); + "with other options:\n"); (void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n"); (void) fprintf(stderr, " -F attempt automatic rewind within " @@ -147,14 +170,31 @@ usage(void) "has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n"); + (void) fprintf(stderr, " -x -- " + "dump all read blocks into specified directory\n"); + (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); + (void) fprintf(stderr, " -I -- " + "specify the maximum number of " + "checksumming I/Os [default is 200]\n"); + (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " + "exiting\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); exit(1); } +static void +dump_debug_buffer() +{ + if (dump_opt['G']) { + (void) printf("\n"); + zfs_dbgmsg_print("zdb"); + } +} + /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. @@ -171,6 +211,8 @@ fatal(const char *fmt, ...) 
va_end(ap); (void) fprintf(stderr, "\n"); + dump_debug_buffer(); + exit(1); } @@ -193,18 +235,48 @@ dump_packed_nvlist(objset_t *os, uint64_ nvlist_free(nv); } -const char dump_zap_stars[] = "****************************************"; -const int dump_zap_width = sizeof (dump_zap_stars) - 1; +/* ARGSUSED */ +static void +dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) +{ + spa_history_phys_t *shp = data; + + if (shp == NULL) + return; + + (void) printf("\t\tpool_create_len = %llu\n", + (u_longlong_t)shp->sh_pool_create_len); + (void) printf("\t\tphys_max_off = %llu\n", + (u_longlong_t)shp->sh_phys_max_off); + (void) printf("\t\tbof = %llu\n", + (u_longlong_t)shp->sh_bof); + (void) printf("\t\teof = %llu\n", + (u_longlong_t)shp->sh_eof); + (void) printf("\t\trecords_lost = %llu\n", + (u_longlong_t)shp->sh_records_lost); +} static void -dump_zap_histogram(uint64_t histo[ZAP_HISTOGRAM_SIZE]) +zdb_nicenum(uint64_t num, char *buf) +{ + if (dump_opt['P']) + (void) sprintf(buf, "%llu", (longlong_t)num); + else + nicenum(num, buf); +} + +const char histo_stars[] = "****************************************"; +const int histo_width = sizeof (histo_stars) - 1; + +static void +dump_histogram(const uint64_t *histo, int size, int offset) { int i; - int minidx = ZAP_HISTOGRAM_SIZE - 1; + int minidx = size - 1; int maxidx = 0; uint64_t max = 0; - for (i = 0; i < ZAP_HISTOGRAM_SIZE; i++) { + for (i = 0; i < size; i++) { if (histo[i] > max) max = histo[i]; if (histo[i] > 0 && i > maxidx) @@ -213,12 +285,14 @@ dump_zap_histogram(uint64_t histo[ZAP_HI minidx = i; } - if (max < dump_zap_width) - max = dump_zap_width; + if (max < histo_width) + max = histo_width; - for (i = minidx; i <= maxidx; i++) - (void) printf("\t\t\t%u: %6llu %s\n", i, (u_longlong_t)histo[i], - &dump_zap_stars[(max - histo[i]) * dump_zap_width / max]); + for (i = minidx; i <= maxidx; i++) { + (void) printf("\t\t\t%3u: %6llu %s\n", + i + offset, (u_longlong_t)histo[i], + &histo_stars[(max - histo[i]) * histo_width / max]); + } } static void @@ -269,19 +343,19 @@ dump_zap_stats(objset_t *os, uint64_t ob (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); - dump_zap_histogram(zs.zs_leafs_with_2n_pointers); + dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); - dump_zap_histogram(zs.zs_blocks_with_n5_entries); + dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); - dump_zap_histogram(zs.zs_blocks_n_tenths_full); + dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); - dump_zap_histogram(zs.zs_entries_using_n_chunks); + dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); - dump_zap_histogram(zs.zs_buckets_with_n_entries); + dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } /*ARGSUSED*/ @@ -359,6 +433,79 @@ dump_zap(objset_t *os, uint64_t object, zap_cursor_fini(&zc); } +static void +dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) +{ + bpobj_phys_t *bpop = data; + char bytes[32], comp[32], uncomp[32]; + + if (bpop == NULL) + return; + + zdb_nicenum(bpop->bpo_bytes, bytes); + zdb_nicenum(bpop->bpo_comp, comp); + zdb_nicenum(bpop->bpo_uncomp, uncomp); + + (void) printf("\t\tnum_blkptrs = %llu\n", + (u_longlong_t)bpop->bpo_num_blkptrs); + (void) printf("\t\tbytes = %s\n", bytes); + if (size >= BPOBJ_SIZE_V1) { 
+ (void) printf("\t\tcomp = %s\n", comp); + (void) printf("\t\tuncomp = %s\n", uncomp); + } + if (size >= sizeof (*bpop)) { + (void) printf("\t\tsubobjs = %llu\n", + (u_longlong_t)bpop->bpo_subobjs); + (void) printf("\t\tnum_subobjs = %llu\n", + (u_longlong_t)bpop->bpo_num_subobjs); + } + + if (dump_opt['d'] < 5) + return; + + for (uint64_t i = 0; i < bpop->bpo_num_blkptrs; i++) { + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t bp; + + int err = dmu_read(os, object, + i * sizeof (bp), sizeof (bp), &bp, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + break; + } + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp); + (void) printf("\t%s\n", blkbuf); + } +} + +/* ARGSUSED */ +static void +dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) +{ + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, object, &doi)); + uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); + + int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(subobjs, doi.doi_max_offset); + return; + } + + int64_t last_nonzero = -1; + for (uint64_t i = 0; i < doi.doi_max_offset / 8; i++) { + if (subobjs[i] != 0) + last_nonzero = i; + } + + for (int64_t i = 0; i <= last_nonzero; i++) { + (void) printf("\t%llu\n", (longlong_t)subobjs[i]); + } + kmem_free(subobjs, doi.doi_max_offset); +} + /*ARGSUSED*/ static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) @@ -369,6 +516,71 @@ dump_ddt_zap(objset_t *os, uint64_t obje /*ARGSUSED*/ static void +dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + (void) printf(" %llx : [%d:%d:%d]\n", + (u_longlong_t)attr.za_first_integer, + (int)ATTR_LENGTH(attr.za_first_integer), + (int)ATTR_BSWAP(attr.za_first_integer), + (int)ATTR_NUM(attr.za_first_integer)); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + uint16_t *layout_attrs; + int i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = [", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + + VERIFY(attr.za_integer_length == 2); + layout_attrs = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + + VERIFY(zap_lookup(os, object, attr.za_name, + attr.za_integer_length, + attr.za_num_integers, layout_attrs) == 0); + + for (i = 0; i != attr.za_num_integers; i++) + (void) printf(" %d ", (int)layout_attrs[i]); + (void) printf("]\n"); + umem_free(layout_attrs, + attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { zap_cursor_t zc; @@ -405,26 +617,89 @@ dump_zpldir(objset_t *os, uint64_t objec zap_cursor_fini(&zc); } +int +get_dtl_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_ops->vdev_op_leaf) { + space_map_t *sm = vd->vdev_dtl_sm; + + if (sm != NULL && + 
sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + return (1); + return (0); + } + + for (int c = 0; c < vd->vdev_children; c++) + refcount += get_dtl_refcount(vd->vdev_child[c]); + return (refcount); +} + +int +get_metaslab_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd && !vd->vdev_removing) { + for (int m = 0; m < vd->vdev_ms_count; m++) { + space_map_t *sm = vd->vdev_ms[m]->ms_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + refcount++; + } + } + for (int c = 0; c < vd->vdev_children; c++) + refcount += get_metaslab_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +verify_spacemap_refcounts(spa_t *spa) +{ + uint64_t expected_refcount = 0; + uint64_t actual_refcount; + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], + &expected_refcount); + actual_refcount = get_dtl_refcount(spa->spa_root_vdev); + actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); + + if (expected_refcount != actual_refcount) { + (void) printf("space map refcount mismatch: expected %lld != " + "actual %lld\n", + (longlong_t)expected_refcount, + (longlong_t)actual_refcount); + return (2); + } + return (0); +} + static void -dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm) +dump_spacemap(objset_t *os, space_map_t *sm) { uint64_t alloc, offset, entry; - uint8_t mapshift = sm->sm_shift; - uint64_t mapstart = sm->sm_start; char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; - if (smo->smo_object == 0) + if (sm == NULL) return; /* * Print out the freelist entries in both encoded and decoded form. */ alloc = 0; - for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) { - VERIFY(0 == dmu_read(os, smo->smo_object, offset, + for (offset = 0; offset < space_map_length(sm); + offset += sizeof (entry)) { + uint8_t mapshift = sm->sm_shift; + + VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (entry), &entry, DMU_READ_PREFETCH)); if (SM_DEBUG_DECODE(entry)) { + (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", (u_longlong_t)(offset / sizeof (entry)), ddata[SM_DEBUG_ACTION_DECODE(entry)], @@ -436,10 +711,10 @@ dump_spacemap(objset_t *os, space_map_ob (u_longlong_t)(offset / sizeof (entry)), SM_TYPE_DECODE(entry) == SM_ALLOC ? 
'A' : 'F', (u_longlong_t)((SM_OFFSET_DECODE(entry) << - mapshift) + mapstart), + mapshift) + sm->sm_start), (u_longlong_t)((SM_OFFSET_DECODE(entry) << - mapshift) + mapstart + (SM_RUN_DECODE(entry) << - mapshift)), + mapshift) + sm->sm_start + + (SM_RUN_DECODE(entry) << mapshift)), (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift)); if (SM_TYPE_DECODE(entry) == SM_ALLOC) alloc += SM_RUN_DECODE(entry) << mapshift; @@ -447,26 +722,28 @@ dump_spacemap(objset_t *os, space_map_ob alloc -= SM_RUN_DECODE(entry) << mapshift; } } - if (alloc != smo->smo_alloc) { + if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%llu) INCONSISTENT " "with space map summary (%llu)\n", - (u_longlong_t)smo->smo_alloc, (u_longlong_t)alloc); + (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { - char maxbuf[5]; - space_map_t *sm = &msp->ms_map; - avl_tree_t *t = sm->sm_pp_root; - int free_pct = sm->sm_space * 100 / sm->sm_size; + char maxbuf[32]; + range_tree_t *rt = msp->ms_tree; + avl_tree_t *t = &msp->ms_size_tree; + int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - nicenum(space_map_maxsize(sm), maxbuf, sizeof(maxbuf)); + zdb_nicenum(metaslab_block_maxsize(msp), maxbuf); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", avl_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); + (void) printf("\tIn-memory histogram:\n"); + dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); } static void @@ -474,33 +751,45 @@ dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - space_map_t *sm = &msp->ms_map; - space_map_obj_t *smo = &msp->ms_smo; - char freebuf[5]; + space_map_t *sm = msp->ms_sm; + char freebuf[32]; - nicenum(sm->sm_size - smo->smo_alloc, freebuf, sizeof(freebuf)); + zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", - (u_longlong_t)(sm->sm_start / sm->sm_size), - (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf); + (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, + (u_longlong_t)space_map_object(sm), freebuf); - if (dump_opt['m'] > 1 && !dump_opt['L']) { + if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); - space_map_load_wait(sm); - if (!sm->sm_loaded) - VERIFY(space_map_load(sm, zfs_metaslab_ops, - SM_FREE, smo, spa->spa_meta_objset) == 0); + metaslab_load_wait(msp); + if (!msp->ms_loaded) { + VERIFY0(metaslab_load(msp)); + range_tree_stat_verify(msp->ms_tree); + } dump_metaslab_stats(msp); - space_map_unload(sm); + metaslab_unload(msp); mutex_exit(&msp->ms_lock); } - if (dump_opt['d'] > 5 || dump_opt['m'] > 2) { - ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift)); + if (dump_opt['m'] > 1 && sm != NULL && + spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + /* + * The space map histogram represents free space in chunks + * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). 
+ */ + (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", + (u_longlong_t)msp->ms_fragmentation); + dump_histogram(sm->sm_phys->smp_histogram, + SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); + } + + if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); mutex_enter(&msp->ms_lock); - dump_spacemap(spa->spa_meta_objset, smo, sm); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); mutex_exit(&msp->ms_lock); } } @@ -518,6 +807,47 @@ print_vdev_metaslab_header(vdev_t *vd) } static void +dump_metaslab_groups(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + metaslab_class_t *mc = spa_normal_class(spa); + uint64_t fragmentation; + + metaslab_class_histogram_verify(mc); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg->mg_class != mc) + continue; + + metaslab_group_histogram_verify(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); + + (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" + "fragmentation", + (u_longlong_t)tvd->vdev_id, + (u_longlong_t)tvd->vdev_ms_count); + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + (void) printf("%3s\n", "-"); + } else { + (void) printf("%3llu%%\n", + (u_longlong_t)mg->mg_fragmentation); + } + dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + } + + (void) printf("\tpool %s\tfragmentation", spa_name(spa)); + fragmentation = metaslab_class_fragmentation(mc); + if (fragmentation == ZFS_FRAG_INVALID) + (void) printf("\t%3s\n", "-"); + else + (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); + dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; @@ -572,7 +902,7 @@ dump_dde(const ddt_t *ddt, const ddt_ent if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); - snprintf_blkptr(blkbuf, sizeof(blkbuf), &blk); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu %s %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, types[p], blkbuf); @@ -617,13 +947,15 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, return; ASSERT(error == 0); - count = ddt_object_count(ddt, type, class); + error = ddt_object_count(ddt, type, class, &count); + ASSERT(error == 0); + if (count == 0) + return; + dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; - ASSERT(count != 0); /* we should have destroyed it */ - - ddt_object_name(ddt, type, class, name, sizeof(name)); + ddt_object_name(ddt, type, class, name); (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, @@ -687,9 +1019,9 @@ dump_all_ddts(spa_t *spa) } static void -dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size) +dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { - char *prefix = (void *)sm; + char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, @@ -719,28 +1051,32 @@ dump_dtl(vdev_t *vd, int indent) required ? 
"DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { - space_map_t *sm = &vd->vdev_dtl[t]; - if (sm->sm_space == 0) + range_tree_t *rt = vd->vdev_dtl[t]; + if (range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); - mutex_enter(sm->sm_lock); - space_map_walk(sm, dump_dtl_seg, (void *)prefix); - mutex_exit(sm->sm_lock); + mutex_enter(rt->rt_lock); + range_tree_walk(rt, dump_dtl_seg, prefix); + mutex_exit(rt->rt_lock); if (dump_opt['d'] > 5 && vd->vdev_children == 0) - dump_spacemap(spa->spa_meta_objset, - &vd->vdev_dtl_smo, sm); + dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (int c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } +/* from spa_history.c: spa_history_create_obj() */ +#define HIS_BUF_LEN_DEF (128 << 10) +#define HIS_BUF_LEN_MAX (1 << 30) + static void dump_history(spa_t *spa) { nvlist_t **events = NULL; - char buf[SPA_MAXBLOCKSIZE]; + char *buf = NULL; + uint64_t bufsize = HIS_BUF_LEN_DEF; uint64_t resid, len, off = 0; uint_t num = 0; int error; @@ -749,8 +1085,11 @@ dump_history(spa_t *spa) char tbuf[30]; char internalstr[MAXPATHLEN]; + if ((buf = malloc(bufsize)) == NULL) + (void) fprintf(stderr, "Unable to read history: " + "out of memory\n"); do { - len = sizeof (buf); + len = bufsize; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " @@ -760,34 +1099,52 @@ dump_history(spa_t *spa) if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; - off -= resid; + + /* + * If the history block is too big, double the buffer + * size and try again. + */ + if (resid == len) { + free(buf); + buf = NULL; + + bufsize <<= 1; + if ((bufsize >= HIS_BUF_LEN_MAX) || + ((buf = malloc(bufsize)) == NULL)) { + (void) fprintf(stderr, "Unable to read history: " + "out of memory\n"); + return; + } + } } while (len != 0); + free(buf); (void) printf("\nHistory:\n"); for (int i = 0; i < num; i++) { uint64_t time, txg, ievent; char *cmd, *intstr; + boolean_t printed = B_FALSE; if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, &time) != 0) - continue; + goto next; if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, &cmd) != 0) { if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT, &ievent) != 0) - continue; + goto next; verify(nvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG, &txg) == 0); verify(nvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR, &intstr) == 0); - if (ievent >= LOG_END) - continue; + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) + goto next; (void) snprintf(internalstr, sizeof (internalstr), "[internal %s txg:%lld] %s", - hist_event_table[ievent], txg, + zfs_history_event_names[ievent], txg, intstr); cmd = internalstr; } @@ -795,6 +1152,14 @@ dump_history(spa_t *spa) (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); (void) printf("%s %s\n", tbuf, cmd); + printed = B_TRUE; + +next: + if (dump_opt['h'] > 1) { + if (!printed) + (void) printf("unrecognized record:\n"); + dump_nvlist(events[i], 2); + } } } @@ -805,7 +1170,8 @@ dump_dnode(objset_t *os, uint64_t object } static uint64_t -blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) +blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); @@ -822,47 +1188,63 @@ blkid2offset(const dnode_phys_t *dnp, co } static void -snprintf_blkptr_compact(char *blkbuf, size_t blklen, blkptr_t *bp) 
+snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp) { - dva_t *dva = bp->blk_dva; + const dva_t *dva = bp->blk_dva; int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; - size_t len; - if (dump_opt['b'] >= 5) { - snprintf_blkptr(blkbuf, blklen, bp); + if (dump_opt['b'] >= 6) { + snprintf_blkptr(blkbuf, buflen, bp); return; } - blkbuf[0] = '\0'; + if (BP_IS_EMBEDDED(bp)) { + (void) sprintf(blkbuf, + "EMBEDDED et=%u %llxL/%llxP B=%llu", + (int)BPE_GET_ETYPE(bp), + (u_longlong_t)BPE_GET_LSIZE(bp), + (u_longlong_t)BPE_GET_PSIZE(bp), + (u_longlong_t)bp->blk_birth); + return; + } - len = 0; - for (int i = 0; i < ndvas; i++) { - len += snprintf(blkbuf + len, blklen - len, "%llu:%llx:%llx ", + blkbuf[0] = '\0'; + for (int i = 0; i < ndvas; i++) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); - if (len > blklen) - len = blklen; - } - snprintf(blkbuf + len, blklen - len, - "%llxL/%llxP F=%llu B=%llu/%llu", - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp), - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_birth, - (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (BP_IS_HOLE(bp)) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL B=%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)bp->blk_birth); + } else { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)BP_GET_FILL(bp), + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + } } static void -print_indirect(blkptr_t *bp, const zbookmark_t *zb, +print_indirect(blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; int l; - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); + if (!BP_IS_EMBEDDED(bp)) { + ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); + ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); + } (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); @@ -876,13 +1258,13 @@ print_indirect(blkptr_t *bp, const zbook } } - snprintf_blkptr_compact(blkbuf, sizeof(blkbuf), bp); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, - blkptr_t *bp, const zbookmark_t *zb) + blkptr_t *bp, const zbookmark_phys_t *zb) { int err = 0; @@ -891,23 +1273,24 @@ visit_indirect(spa_t *spa, const dnode_p print_indirect(bp, zb, dnp); - if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; - err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &buf, + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); + ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { - zbookmark_t czb; + zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, @@ -915,11 +1298,11 @@ visit_indirect(spa_t *spa, const dnode_p err = visit_indirect(spa, dnp, cbp, &czb); if (err) break; - fill += cbp->blk_fill; + fill += BP_GET_FILL(cbp); } if (!err) - ASSERT3U(fill, 
==, bp->blk_fill); - (void) arc_buf_remove_ref(buf, &buf); + ASSERT3U(fill, ==, BP_GET_FILL(bp)); + arc_buf_destroy(buf, &buf); } return (err); @@ -931,7 +1314,7 @@ dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; int j; - zbookmark_t czb; + zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); @@ -952,7 +1335,7 @@ dump_dsl_dir(objset_t *os, uint64_t obje { dsl_dir_phys_t *dd = data; time_t crtime; - char nice[6]; + char nice[32]; if (dd == NULL) return; @@ -969,15 +1352,15 @@ dump_dsl_dir(objset_t *os, uint64_t obje (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); - nicenum(dd->dd_used_bytes, nice, sizeof(nice)); + zdb_nicenum(dd->dd_used_bytes, nice); (void) printf("\t\tused_bytes = %s\n", nice); - nicenum(dd->dd_compressed_bytes, nice, sizeof(nice)); + zdb_nicenum(dd->dd_compressed_bytes, nice); (void) printf("\t\tcompressed_bytes = %s\n", nice); - nicenum(dd->dd_uncompressed_bytes, nice, sizeof(nice)); + zdb_nicenum(dd->dd_uncompressed_bytes, nice); (void) printf("\t\tuncompressed_bytes = %s\n", nice); - nicenum(dd->dd_quota, nice, sizeof(nice)); + zdb_nicenum(dd->dd_quota, nice); (void) printf("\t\tquota = %s\n", nice); - nicenum(dd->dd_reserved, nice, sizeof(nice)); + zdb_nicenum(dd->dd_reserved, nice); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); @@ -987,7 +1370,7 @@ dump_dsl_dir(objset_t *os, uint64_t obje (u_longlong_t)dd->dd_flags); #define DO(which) \ - nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, sizeof(nice)); \ + zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); @@ -1003,7 +1386,7 @@ dump_dsl_dataset(objset_t *os, uint64_t { dsl_dataset_phys_t *ds = data; time_t crtime; - char used[6], compressed[6], uncompressed[6], unique[6]; + char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; if (ds == NULL) @@ -1011,11 +1394,11 @@ dump_dsl_dataset(objset_t *os, uint64_t ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; - nicenum(ds->ds_used_bytes, used, sizeof(used)); - nicenum(ds->ds_compressed_bytes, compressed, sizeof(compressed)); - nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof(uncompressed)); - nicenum(ds->ds_unique_bytes, unique, sizeof(unique)); - snprintf_blkptr(blkbuf, sizeof(blkbuf), &ds->ds_bp); + zdb_nicenum(ds->ds_referenced_bytes, used); + zdb_nicenum(ds->ds_compressed_bytes, compressed); + zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed); + zdb_nicenum(ds->ds_unique_bytes, unique); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); @@ -1053,63 +1436,166 @@ dump_dsl_dataset(objset_t *os, uint64_t (void) printf("\t\tbp = %s\n", blkbuf); } +/* ARGSUSED */ +static int +dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (bp->blk_birth != 0) { + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("\t%s\n", blkbuf); + } + return (0); +} + static void -dump_bplist(objset_t *mos, uint64_t object, char *name) +dump_bptree(objset_t *os, uint64_t obj, char *name) { - bplist_t bpl = { 0 }; - blkptr_t blk, *bp = &blk; - uint64_t itor = 0; - char bytes[6]; - char comp[6]; - char uncomp[6]; + char bytes[32]; + bptree_phys_t *bt; + dmu_buf_t *db; if (dump_opt['d'] < 3) return; - bplist_init(&bpl); - VERIFY(0 == 
bplist_open(&bpl, mos, object)); - if (bplist_empty(&bpl)) { - bplist_close(&bpl); - bplist_fini(&bpl); + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + zdb_nicenum(bt->bt_bytes, bytes); + (void) printf("\n %s: %llu datasets, %s\n", + name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); + dmu_buf_rele(db, FTAG); + + if (dump_opt['d'] < 5) return; - } - nicenum(bpl.bpl_phys->bpl_bytes, bytes, sizeof(bytes)); - if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) { - nicenum(bpl.bpl_phys->bpl_comp, comp, sizeof(comp)); - nicenum(bpl.bpl_phys->bpl_uncomp, uncomp, sizeof(uncomp)); - (void) printf("\n %s: %llu entries, %s (%s/%s comp)\n", - name, (u_longlong_t)bpl.bpl_phys->bpl_entries, + (void) printf("\n"); + + (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); +} + +/* ARGSUSED */ +static int +dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + ASSERT(bp->blk_birth != 0); + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp); + (void) printf("\t%s\n", blkbuf); + return (0); +} + +static void +dump_full_bpobj(bpobj_t *bpo, char *name, int indent) +{ + char bytes[32]; + char comp[32]; + char uncomp[32]; + + if (dump_opt['d'] < 3) + return; + + zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes); + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + zdb_nicenum(bpo->bpo_phys->bpo_comp, comp); + zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp); + (void) printf(" %*s: object %llu, %llu local blkptrs, " + "%llu subobjs in object %llu, %s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); + + for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { + uint64_t subobj; + bpobj_t subbpo; + int error; + VERIFY0(dmu_read(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + i * sizeof (subobj), sizeof (subobj), &subobj, 0)); + error = bpobj_open(&subbpo, bpo->bpo_os, subobj); + if (error != 0) { + (void) printf("ERROR %u while trying to open " + "subobj id %llu\n", + error, (u_longlong_t)subobj); + continue; + } + dump_full_bpobj(&subbpo, "subobj", indent + 1); + bpobj_close(&subbpo); + } } else { - (void) printf("\n %s: %llu entries, %s\n", - name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes); + (void) printf(" %*s: object %llu, %llu blkptrs, %s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + bytes); } - if (dump_opt['d'] < 5) { - bplist_close(&bpl); - bplist_fini(&bpl); + if (dump_opt['d'] < 5) return; + + + if (indent == 0) { + (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); + (void) printf("\n"); } +} - (void) printf("\n"); +static void +dump_deadlist(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + uint64_t unused; + char bytes[32]; + char comp[32]; + char uncomp[32]; - while (bplist_iterate(&bpl, &itor, bp) == 0) { - char blkbuf[BP_SPRINTF_LEN]; + if (dump_opt['d'] < 3) + return; - ASSERT(bp->blk_birth != 0); - snprintf_blkptr_compact(blkbuf, sizeof(blkbuf), bp); - (void) printf("\tItem %3llu: %s\n", - (u_longlong_t)itor - 1, blkbuf); + if (dl->dl_oldfmt) { + dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); + return; } - bplist_close(&bpl); - bplist_fini(&bpl); + zdb_nicenum(dl->dl_phys->dl_used, bytes); + zdb_nicenum(dl->dl_phys->dl_comp, comp); + zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp); + (void) printf("\n Deadlist: %s (%s/%s 
comp)\n", + bytes, comp, uncomp); + + if (dump_opt['d'] < 4) + return; + + (void) printf("\n"); + + /* force the tree to be loaded */ + dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused); + + for (dle = avl_first(&dl->dl_tree); dle; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + if (dump_opt['d'] >= 5) { + char buf[128]; + (void) snprintf(buf, sizeof (buf), "mintxg %llu -> " + "obj %llu", (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + dump_full_bpobj(&dle->dle_bpobj, buf, 0); + } else { + (void) printf("mintxg %llu -> obj %llu\n", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + } + } } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; +static boolean_t sa_loaded; +sa_attr_type_t *sa_attr_table; static void fuid_table_destroy() @@ -1124,7 +1610,7 @@ fuid_table_destroy() * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by - * the doman-rid string. + * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) @@ -1142,12 +1628,12 @@ print_idstr(uint64_t id, const char *id_ } static void -dump_uidgid(objset_t *os, znode_phys_t *zp) +dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; - uid_idx = FUID_INDEX(zp->zp_uid); - gid_idx = FUID_INDEX(zp->zp_gid); + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { @@ -1162,50 +1648,111 @@ dump_uidgid(objset_t *os, znode_phys_t * fuid_table_loaded = B_TRUE; } - print_idstr(zp->zp_uid, "uid"); - print_idstr(zp->zp_gid, "gid"); + print_idstr(uid, "uid"); + print_idstr(gid, "gid"); } /*ARGSUSED*/ static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { - znode_phys_t *zp = data; - time_t z_crtime, z_atime, z_mtime, z_ctime; char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ + sa_handle_t *hdl; + uint64_t xattr, rdev, gen; + uint64_t uid, gid, mode, fsize, parent, links; + uint64_t pflags; + uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; + time_t z_crtime, z_atime, z_mtime, z_ctime; + sa_bulk_attr_t bulk[12]; + int idx = 0; int error; - ASSERT(size >= sizeof (znode_phys_t)); + if (!sa_loaded) { + uint64_t sa_attrs = 0; + uint64_t version; + + VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &version) == 0); + if (version >= ZPL_VERSION_SA) { + VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, + 8, 1, &sa_attrs) == 0); + } + if ((error = sa_setup(os, sa_attrs, zfs_attr_table, + ZPL_END, &sa_attr_table)) != 0) { + (void) printf("sa_setup failed errno %d, can't " + "display znode contents\n", error); + return; + } + sa_loaded = B_TRUE; + } + + if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { + (void) printf("Failed to get handle for SA znode\n"); + return; + } + + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, + &links, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, + &mode, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], + NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, + &fsize, 8); + SA_ADD_BULK_ATTR(bulk, idx, 
sa_attr_table[ZPL_ATIME], NULL, + acctm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, + modtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, + crtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, + chgtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, + &pflags, 8); + + if (sa_bulk_lookup(hdl, bulk, idx)) { + (void) sa_handle_destroy(hdl); + return; + } error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error != 0) { (void) snprintf(path, sizeof (path), "\?\?\?", (u_longlong_t)object); } - if (dump_opt['d'] < 3) { (void) printf("\t%s\n", path); + (void) sa_handle_destroy(hdl); return; } - z_crtime = (time_t)zp->zp_crtime[0]; - z_atime = (time_t)zp->zp_atime[0]; - z_mtime = (time_t)zp->zp_mtime[0]; - z_ctime = (time_t)zp->zp_ctime[0]; + z_crtime = (time_t)crtm[0]; + z_atime = (time_t)acctm[0]; + z_mtime = (time_t)modtm[0]; + z_ctime = (time_t)chgtm[0]; (void) printf("\tpath %s\n", path); - dump_uidgid(os, zp); + dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); - (void) printf("\tgen %llu\n", (u_longlong_t)zp->zp_gen); - (void) printf("\tmode %llo\n", (u_longlong_t)zp->zp_mode); - (void) printf("\tsize %llu\n", (u_longlong_t)zp->zp_size); - (void) printf("\tparent %llu\n", (u_longlong_t)zp->zp_parent); - (void) printf("\tlinks %llu\n", (u_longlong_t)zp->zp_links); - (void) printf("\txattr %llu\n", (u_longlong_t)zp->zp_xattr); - (void) printf("\trdev 0x%016llx\n", (u_longlong_t)zp->zp_rdev); + (void) printf("\tgen %llu\n", (u_longlong_t)gen); + (void) printf("\tmode %llo\n", (u_longlong_t)mode); + (void) printf("\tsize %llu\n", (u_longlong_t)fsize); + (void) printf("\tparent %llu\n", (u_longlong_t)parent); + (void) printf("\tlinks %llu\n", (u_longlong_t)links); + (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); + if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, + sizeof (uint64_t)) == 0) + (void) printf("\txattr %llu\n", (u_longlong_t)xattr); + if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, + sizeof (uint64_t)) == 0) + (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); + sa_handle_destroy(hdl); } /*ARGSUSED*/ @@ -1226,8 +1773,8 @@ static object_viewer_t *object_viewer[DM dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ - dump_none, /* bplist */ - dump_none, /* bplist header */ + dump_none, /* bpobj */ + dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ @@ -1251,7 +1798,7 @@ static object_viewer_t *object_viewer[DM dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ - dump_uint64, /* SPA history offsets */ + dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ @@ -1265,7 +1812,17 @@ static object_viewer_t *object_viewer[DM dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ - dump_unknown /* Unknown type, must be last */ + dump_znode, /* SA object */ + dump_zap, /* SA Master Node */ + dump_sa_attrs, /* SA attribute registration */ + dump_sa_layouts, /* SA attribute layouts */ + dump_zap, /* DSL scrub translations */ + dump_none, /* fake dedup BP */ + dump_zap, /* deadlist */ + dump_none, /* deadlist hdr */ 
+ dump_zap, /* dsl clones */ + dump_bpobj_subobjs, /* bpobj subobjs */ + dump_unknown, /* Unknown type, must be last */ }; static void @@ -1276,7 +1833,8 @@ dump_object(objset_t *os, uint64_t objec dnode_t *dn; void *bonus = NULL; size_t bsize = 0; - char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7]; + char iblk[32], dblk[32], lsize[32], asize[32], fill[32]; + char bonus_size[32]; char aux[50]; int error; @@ -1288,7 +1846,7 @@ dump_object(objset_t *os, uint64_t objec } if (object == 0) { - dn = os->os_meta_dnode; + dn = DMU_META_DNODE(os); } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) @@ -1296,16 +1854,16 @@ dump_object(objset_t *os, uint64_t objec object, error); bonus = db->db_data; bsize = db->db_size; - dn = ((dmu_buf_impl_t *)db)->db_dnode; + dn = DB_DNODE((dmu_buf_impl_t *)db); } dmu_object_info_from_dnode(dn, &doi); - nicenum(doi.doi_metadata_block_size, iblk, sizeof(iblk)); - nicenum(doi.doi_data_block_size, dblk, sizeof(dblk)); - nicenum(doi.doi_max_offset, lsize, sizeof(lsize)); - nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof(asize)); - nicenum(doi.doi_bonus_size, bonus_size, sizeof(bonus_size)); - (void) snprintf(fill, sizeof(fill), "%6.2f", 100.0 * doi.doi_fill_count * + zdb_nicenum(doi.doi_metadata_block_size, iblk); + zdb_nicenum(doi.doi_data_block_size, dblk); + zdb_nicenum(doi.doi_max_offset, lsize); + zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize); + zdb_nicenum(doi.doi_bonus_size, bonus_size); + (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / doi.doi_max_offset); @@ -1332,11 +1890,13 @@ dump_object(objset_t *os, uint64_t objec } if (verbosity >= 4) { - (void) printf("\tdnode flags: %s%s\n", + (void) printf("\tdnode flags: %s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? - "USERUSED_ACCOUNTED " : ""); + "USERUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? 
+ "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); @@ -1364,7 +1924,7 @@ dump_object(objset_t *os, uint64_t objec } for (;;) { - char segsize[6]; + char segsize[32]; error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) @@ -1372,7 +1932,7 @@ dump_object(objset_t *os, uint64_t objec end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); - nicenum(end - start, segsize, sizeof(segsize)); + zdb_nicenum(end - start, segsize); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); @@ -1395,39 +1955,38 @@ dump_dir(objset_t *os) dmu_objset_stats_t dds; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; - char numbuf[8]; + char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; - char osname[MAXNAMELEN]; + char osname[ZFS_MAX_DATASET_NAME_LEN]; char *type = "UNKNOWN"; int verbosity = dump_opt['d']; int print_header = 1; int i, error; - size_t len; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; - usedobjs = os->os_rootbp->blk_fill; - refdbytes = os->os_spa->spa_dsl_pool-> - dp_mos_dir->dd_phys->dd_used_bytes; + usedobjs = BP_GET_FILL(os->os_rootbp); + refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> + dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } - ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill); + ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); - nicenum(refdbytes, numbuf, sizeof(numbuf)); + zdb_nicenum(refdbytes, numbuf); if (verbosity >= 4) { - size_t blklen = sizeof(blkbuf); - len = snprintf(blkbuf, blklen, ", rootbp "); - if (len > blklen) - len = blklen; - snprintf_blkptr(blkbuf + len, blklen - len, os->os_rootbp); + (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); + (void) snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } @@ -1452,19 +2011,18 @@ dump_dir(objset_t *os) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) - dump_bplist(dmu_objset_pool(os)->dp_meta_objset, - dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist"); + dump_deadlist(&dmu_objset_ds(os)->ds_deadlist); if (verbosity < 2) return; - if (os->os_rootbp->blk_birth == 0) + if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header); object_count = 0; - if (os->os_userused_dnode && - os->os_userused_dnode->dn_type != 0) { + if (DMU_USERUSED_DNODE(os) != NULL && + DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); } @@ -1499,7 +2057,7 @@ dump_uberblock(uberblock_t *ub, const ch (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; - snprintf_blkptr(blkbuf, sizeof(blkbuf), &ub->ub_rootbp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf(footer ? 
footer : ""); @@ -1538,13 +2096,13 @@ dump_cachefile(const char *cachefile) nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { - (void) printf("cannot open '%s': %s\n", cachefile, + (void) fprintf(stderr, "cannot open '%s': %s\n", cachefile, strerror(errno)); exit(1); } if (fstat64(fd, &statbuf) != 0) { - (void) printf("failed to stat '%s': %s\n", cachefile, + (void) fprintf(stderr, "failed to stat '%s': %s\n", cachefile, strerror(errno)); exit(1); } @@ -1604,19 +2162,41 @@ dump_label(const char *dev) { int fd; vdev_label_t label; - char *buf = label.vl_vdev_phys.vp_nvlist; + char *path, *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; uint64_t psize, ashift; + int len = strlen(dev) + 1; + + if (strncmp(dev, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) { + len++; + path = malloc(len); + (void) snprintf(path, len, "%s%s", ZFS_RDISK_ROOTD, + dev + strlen(ZFS_DISK_ROOTD)); + } else { + path = strdup(dev); + } - if ((fd = open64(dev, O_RDONLY)) < 0) { - (void) printf("cannot open '%s': %s\n", dev, strerror(errno)); + if ((fd = open64(path, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", path, strerror(errno)); + free(path); exit(1); } if (fstat64(fd, &statbuf) != 0) { - (void) printf("failed to stat '%s': %s\n", dev, + (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); + free(path); + (void) close(fd); + exit(1); + } + + if (S_ISBLK(statbuf.st_mode)) { + (void) printf("cannot use '%s': character device required\n", + path); + free(path); + (void) close(fd); + exit(1); } psize = statbuf.st_size; @@ -1652,8 +2232,13 @@ dump_label(const char *dev) if (dump_opt['u']) dump_label_uberblocks(&label, ashift); } + + free(path); + (void) close(fd); } +static uint64_t dataset_feature_count[SPA_FEATURES]; + /*ARGSUSED*/ static int dump_one_dir(const char *dsname, void *arg) @@ -1666,20 +2251,34 @@ dump_one_dir(const char *dsname, void *a (void) printf("Could not open %s, error %d\n", dsname, error); return (0); } + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!dmu_objset_ds(os)->ds_feature_inuse[f]) + continue; + ASSERT(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET); + dataset_feature_count[f]++; + } + dump_dir(os); dmu_objset_disown(os, FTAG); fuid_table_destroy(); + sa_loaded = B_FALSE; return (0); } /* * Block statistics. 
*/ +#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { - uint64_t zb_asize; - uint64_t zb_lsize; - uint64_t zb_psize; - uint64_t zb_count; + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_count; + uint64_t zb_gangs; + uint64_t zb_ditto_samevdev; + uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* @@ -1687,11 +2286,13 @@ typedef struct zdb_blkstats { */ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) -#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static char *zdb_ot_extname[] = { "deferred free", "dedup ditto", + "other", "Total", }; @@ -1701,13 +2302,20 @@ typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; + uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] + [BPE_PAYLOAD_SIZE]; + uint64_t zcb_start; + uint64_t zcb_lastprint; + uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; + spa_t *zcb_spa; } zdb_cb_t; static void -zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp, +zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { uint64_t refcnt = 0; @@ -1720,12 +2328,50 @@ zdb_count_block(spa_t *spa, zilog_t *zil for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; + int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; + + /* + * The histogram is only big enough to record blocks up to + * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, + * "other", bucket. + */ + int idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; + idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); + zb->zb_psize_histogram[idx]++; + + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal != 0) + zb->zb_ditto_samevdev++; + break; + } + + } + + if (BP_IS_EMBEDDED(bp)) { + zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; + zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] + [BPE_GET_PSIZE(bp)]++; + return; } if (dump_opt['L']) @@ -1735,7 +2381,7 @@ zdb_count_block(spa_t *spa, zilog_t *zil ddt_t *ddt; ddt_entry_t *dde; - ddt = ddt_select(spa, bp); + ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_FALSE); @@ -1751,115 +2397,149 @@ zdb_count_block(spa_t *spa, zilog_t *zil ddt_exit(ddt); } - VERIFY3U(zio_wait(zio_claim(NULL, spa, - refcnt ? 0 : spa_first_txg(spa), + VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, + refcnt ? 
0 : spa_first_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } +/* ARGSUSED */ +static void +zdb_blkptr_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + int ioerr = zio->io_error; + zdb_cb_t *zcb = zio->io_private; + zbookmark_phys_t *zb = &zio->io_bookmark; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + char blkbuf[BP_SPRINTF_LEN]; + + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + else + blkbuf[0] = '\0'; + + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + mutex_exit(&spa->spa_scrub_lock); +} + +/* ARGSUSED */ static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; - char blkbuf[BP_SPRINTF_LEN]; dmu_object_type_t type; boolean_t is_metadata; if (bp == NULL) return (0); + if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), + blkbuf); + } + + if (BP_IS_HOLE(bp)) + return (0); + type = BP_GET_TYPE(bp); - zdb_count_block(spa, zilog, zcb, bp, type); + zdb_count_block(zcb, zilog, bp, + (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); - is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata); + is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); - if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { - int ioerr; + if (!BP_IS_EMBEDDED(bp) && + (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = malloc(size); + void *data = zio_data_buf_alloc(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. 
*/ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; - ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb)); - - free(data); - - if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) { - zcb->zcb_haderrors = 1; - zcb->zcb_errors[ioerr]++; - - if (dump_opt['b'] >= 2) - snprintf_blkptr(blkbuf, sizeof(blkbuf), bp); - else - blkbuf[0] = '\0'; + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > max_inflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); - (void) printf("zdb_blkptr_cb: " - "Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- skipping\n", - ioerr, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf); - } + zio_nowait(zio_read(NULL, spa, bp, data, size, + zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; - if (dump_opt['b'] >= 4) { - snprintf_blkptr(blkbuf, sizeof(blkbuf), bp); - (void) printf("objset %llu object %llu " - "level %lld offset 0x%llx %s\n", - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (u_longlong_t)blkid2offset(dnp, bp, zb), - blkbuf); + /* only call gethrtime() every 100 blocks */ + static int iters; + if (++iters > 100) + iters = 0; + else + return (0); + + if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { + uint64_t now = gethrtime(); + char buf[10]; + uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; + int kb_per_sec = + 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); + int sec_remaining = + (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; + + zfs_nicenum(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, + "\r%5s completed (%4dMB/s) " + "estimated time remaining: %uhr %02umin %02usec ", + buf, kb_per_sec / 1024, + sec_remaining / 60 / 60, + sec_remaining / 60 % 60, + sec_remaining % 60); + + zcb->zcb_lastprint = now; } return (0); } static void -zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) +zdb_leak(void *arg, uint64_t start, uint64_t size) { - vdev_t *vd = sm->sm_ppd; + vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } -/* ARGSUSED */ -static void -zdb_space_map_load(space_map_t *sm) -{ -} - -static void -zdb_space_map_unload(space_map_t *sm) -{ - space_map_vacate(sm, zdb_leak, sm); -} - -/* ARGSUSED */ -static void -zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) -{ -} - -static space_map_ops_t zdb_space_map_ops = { - zdb_space_map_load, - zdb_space_map_unload, - NULL, /* alloc */ - zdb_space_map_claim, - NULL, /* free */ - NULL /* maxsize */ +static metaslab_ops_t zdb_metaslab_ops = { + NULL /* alloc */ }; static void @@ -1884,8 +2564,7 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t * ddt_bp_create(ddb.ddb_checksum, &dde.dde_key, ddp, &blk); if (p == DDT_PHYS_DITTO) { - zdb_count_block(spa, NULL, zcb, &blk, - ZDB_OT_DITTO); + zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); @@ -1906,21 +2585,63 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t * static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { + zcb->zcb_spa = spa; + if (!dump_opt['L']) { vdev_t *rvd = spa->spa_root_vdev; - for (int c = 0; c < rvd->vdev_children; c++) { + + /* + * We are going to be changing the meaning of the metaslab's + * 
ms_tree. Ensure that the allocator doesn't try to + * use the tree. + */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; - for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_group_t *mg = vd->vdev_mg; + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(msp->ms_group, ==, mg); mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); - VERIFY(space_map_load(&msp->ms_map, - &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo, - spa->spa_meta_objset) == 0); - msp->ms_map.sm_ppd = vd; + metaslab_unload(msp); + + /* + * For leak detection, we overload the metaslab + * ms_tree to contain allocated segments + * instead of free segments. As a result, + * we can't use the normal metaslab_load/unload + * interfaces. + */ + if (msp->ms_sm != NULL) { + (void) fprintf(stderr, + "\rloading space map for " + "vdev %llu of %llu, " + "metaslab %llu of %llu ...", + (longlong_t)c, + (longlong_t)rvd->vdev_children, + (longlong_t)m, + (longlong_t)vd->vdev_ms_count); + + /* + * We don't want to spend the CPU + * manipulating the size-ordered + * tree, so clear the range_tree + * ops. + */ + msp->ms_tree->rt_ops = NULL; + VERIFY0(space_map_load(msp->ms_sm, + msp->ms_tree, SM_ALLOC)); + + if (!msp->ms_loaded) { + msp->ms_loaded = B_TRUE; + } + } mutex_exit(&msp->ms_lock); } } + (void) fprintf(stderr, "\n"); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1937,16 +2658,50 @@ zdb_leak_fini(spa_t *spa) vdev_t *rvd = spa->spa_root_vdev; for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; + metaslab_group_t *mg = vd->vdev_mg; for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(mg, ==, msp->ms_group); mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); + + /* + * The ms_tree has been overloaded to + * contain allocated segments. Now that we + * finished traversing all blocks, any + * block that remains in the ms_tree + * represents an allocated block that we + * did not claim during the traversal. + * Claimed blocks would have been removed + * from the ms_tree. + */ + range_tree_vacate(msp->ms_tree, zdb_leak, vd); + + if (msp->ms_loaded) { + msp->ms_loaded = B_FALSE; + } + mutex_exit(&msp->ms_lock); } } } } +/* ARGSUSED */ +static int +count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + + if (dump_opt['b'] >= 5) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("[%s] %s\n", + "deferred free", blkbuf); + } + zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); + return (0); +} + static int dump_block_stats(spa_t *spa) { @@ -1954,9 +2709,9 @@ dump_block_stats(spa_t *spa) zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; - int leaks = 0; + boolean_t leaks = B_FALSE; - (void) printf("\nTraversing all blocks %s%s%s%s%s...\n", + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", @@ -1976,32 +2731,39 @@ dump_block_stats(spa_t *spa) /* * If there's a deferred-free bplist, process that first. 
*/ - if (spa->spa_deferred_bplist_obj != 0) { - bplist_t *bpl = &spa->spa_deferred_bplist; - blkptr_t blk; - uint64_t itor = 0; - - VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset, - spa->spa_deferred_bplist_obj)); - - while (bplist_iterate(bpl, &itor, &blk) == 0) { - if (dump_opt['b'] >= 4) { - char blkbuf[BP_SPRINTF_LEN]; - snprintf_blkptr(blkbuf, sizeof(blkbuf), &blk); - (void) printf("[%s] %s\n", - "deferred free", blkbuf); - } - zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED); - } - - bplist_close(bpl); + (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, + count_block_cb, &zcb, NULL); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, + count_block_cb, &zcb, NULL); + } + if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, + &zcb, NULL)); } if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; + zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); + zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + /* + * If we've traversed the data blocks then we need to wait for those + * I/Os to complete. We leverage "The Godfather" zio to wait on + * all async I/Os to complete. + */ + if (dump_opt['c']) { + for (int i = 0; i < max_ncpus; i++) { + (void) zio_wait(spa->spa_async_zio_root[i]); + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + } + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); @@ -2037,7 +2799,7 @@ dump_block_stats(spa_t *spa) (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); - leaks = 1; + leaks = B_TRUE; } if (tzb->zb_count == 0) @@ -2046,6 +2808,8 @@ dump_block_stats(spa_t *spa) (void) printf("\n"); (void) printf("\tbp count: %10llu\n", (u_longlong_t)tzb->zb_count); + (void) printf("\tganged count: %10llu\n", + (longlong_t)tzb->zb_gangs); (void) printf("\tbp logical: %10llu avg: %6llu\n", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); @@ -2067,13 +2831,36 @@ dump_block_stats(spa_t *spa) (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { + if (zcb.zcb_embedded_blocks[i] == 0) + continue; + (void) printf("\n"); + (void) printf("\tadditional, non-pointer bps of type %u: " + "%10llu\n", + i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); + + if (dump_opt['b'] >= 3) { + (void) printf("\t number of (compressed) bytes: " + "number of bps\n"); + dump_histogram(zcb.zcb_embedded_histogram[i], + sizeof (zcb.zcb_embedded_histogram[i]) / + sizeof (zcb.zcb_embedded_histogram[i][0]), 0); + } + } + + if (tzb->zb_ditto_samevdev != 0) { + (void) printf("\tDittoed blocks on same vdev: %llu\n", + (longlong_t)tzb->zb_ditto_samevdev); + } + if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); for (t = 0; t <= ZDB_OT_TOTAL; t++) { - char csize[6], lsize[6], psize[6], asize[6], avg[6]; + char csize[32], lsize[32], psize[32], asize[32]; + char avg[32], gang[32]; char *typename; if (t < DMU_OT_NUMTYPES) @@ -2109,11 +2896,12 @@ dump_block_stats(spa_t *spa) zcb.zcb_type[ZB_TOTAL][t].zb_asize) continue; - nicenum(zb->zb_count, csize, sizeof(csize)); - nicenum(zb->zb_lsize, lsize, sizeof(lsize)); - nicenum(zb->zb_psize, psize, sizeof(psize)); - nicenum(zb->zb_asize, asize, sizeof(asize)); - nicenum(zb->zb_asize / zb->zb_count, avg, sizeof(avg)); + zdb_nicenum(zb->zb_count, csize); + zdb_nicenum(zb->zb_lsize, lsize); + zdb_nicenum(zb->zb_psize, psize); + zdb_nicenum(zb->zb_asize, asize); + zdb_nicenum(zb->zb_asize / zb->zb_count, avg); + zdb_nicenum(zb->zb_gangs, gang); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", @@ -2126,6 +2914,19 @@ dump_block_stats(spa_t *spa) else (void) printf(" L%d %s\n", level, typename); + + if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { + (void) printf("\t number of ganged " + "blocks: %s\n", gang); + } + + if (dump_opt['b'] >= 4) { + (void) printf("psize " + "(in 512-byte sectors): " + "number of blocks\n"); + dump_histogram(zb->zb_psize_histogram, + PSIZE_HISTO_SIZE, 0); + } } } } @@ -2153,25 +2954,25 @@ typedef struct zdb_ddt_entry { /* ARGSUSED */ static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; - if (bp == NULL) + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, - (u_longlong_t)bp->blk_fill, + (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || - BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) + BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); 
ddt_key_fill(&zdde_search.zdde_key, bp); @@ -2226,7 +3027,8 @@ dump_simulated_ddt(spa_t *spa) dds.dds_ref_psize = zdde->zdde_ref_psize; dds.dds_ref_dsize = zdde->zdde_ref_dsize; - ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0); + ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], + &dds, 0); umem_free(zdde, sizeof (*zdde)); } @@ -2269,28 +3071,72 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); + if (dump_opt['M']) + dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { dump_dir(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { - dump_bplist(dp->dp_meta_objset, - spa->spa_deferred_bplist_obj, "Deferred frees"); + dump_full_bpobj(&spa->spa_deferred_bpobj, + "Deferred frees", 0); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + dump_full_bpobj( + &spa->spa_dsl_pool->dp_free_bpobj, + "Pool snapshot frees", 0); + } + + if (spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dump_bptree(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, + "Pool dataset frees"); + } dump_dtl(spa->spa_root_vdev, 0); } (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + uint64_t refcount; + + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + ASSERT0(dataset_feature_count[f]); + continue; + } + (void) feature_get_refcount(spa, + &spa_feature_table[f], &refcount); + if (dataset_feature_count[f] != refcount) { + (void) printf("%s feature refcount mismatch: " + "%lld datasets != %lld refcount\n", + spa_feature_table[f].fi_uname, + (longlong_t)dataset_feature_count[f], + (longlong_t)refcount); + rc = 2; + } else { + (void) printf("Verified %s feature refcount " + "of %llu is correct\n", + spa_feature_table[f].fi_uname, + (longlong_t)refcount); + } + } } - if (dump_opt['b'] || dump_opt['c']) + if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); + if (rc == 0) + rc = verify_spacemap_refcounts(spa); + if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); - if (rc != 0) + if (rc != 0) { + dump_debug_buffer(); exit(rc); + } } #define ZDB_FLAG_CHECKSUM 0x0001 @@ -2312,7 +3158,7 @@ zdb_print_blkptr(blkptr_t *bp, int flags if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); - snprintf_blkptr(blkbuf, sizeof(blkbuf), bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } @@ -2508,6 +3354,7 @@ zdb_read_block(char *thing, spa_t *spa) free(dup); return; } + i += p - &flagstr[i + 1]; /* skip over the number */ } } @@ -2666,7 +3513,7 @@ find_zpool(char **target, nvlist_t **con nvlist_t *match = NULL; char *name = NULL; char *sepp = NULL; - char sep; + char sep = '\0'; int count = 0; importargs_t args = { 0 }; @@ -2742,13 +3589,25 @@ main(int argc, char **argv) nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int rewind = ZPOOL_NEVER_REWIND; + char *spa_config_path_env; + boolean_t target_is_spa = B_TRUE; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:")) != -1) { + /* + * If there is an environment variable SPA_CONFIG_PATH it overrides + * default spa_config_path setting. If -U flag is specified it will + * override this environment variable settings once again. 
+ */ + spa_config_path_env = getenv("SPA_CONFIG_PATH"); + if (spa_config_path_env != NULL) + spa_config_path = spa_config_path_env; + + while ((c = getopt(argc, argv, + "bcdhilmMI:suCDRSAFLXx:evp:t:U:PG")) != -1) { switch (c) { case 'b': case 'c': @@ -2761,8 +3620,10 @@ main(int argc, char **argv) case 'u': case 'C': case 'D': + case 'M': case 'R': case 'S': + case 'G': dump_opt[c]++; dump_all = 0; break; @@ -2771,10 +3632,17 @@ main(int argc, char **argv) case 'L': case 'X': case 'e': + case 'P': dump_opt[c]++; break; - case 'v': - verbose++; + case 'I': + max_inflight = strtoull(optarg, NULL, 0); + if (max_inflight == 0) { + (void) fprintf(stderr, "maximum number " + "of inflight I/Os must be greater " + "than 0\n"); + usage(); + } break; case 'p': if (searchdirs == NULL) { @@ -2802,6 +3670,12 @@ main(int argc, char **argv) case 'U': spa_config_path = optarg; break; + case 'v': + verbose++; + break; + case 'x': + vn_dumpdir = optarg; + break; default: usage(); break; @@ -2813,15 +3687,29 @@ main(int argc, char **argv) usage(); } + /* + * ZDB does not typically re-read blocks; therefore limit the ARC + * to 256 MB, which can be used entirely for metadata. + */ + zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; + + /* + * "zdb -c" uses checksum-verifying scrub i/os which are async reads. + * "zdb -b" uses traversal prefetch which uses async reads. + * For good performance, let several of them be active at once. + */ + zfs_vdev_async_read_max_active = 10; + kernel_init(FREAD); g_zfs = libzfs_init(); - ASSERT(g_zfs != NULL); + if (g_zfs == NULL) + fatal("Fail to initialize zfs"); if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && !strchr("elAFLRSX", c)) + if (dump_all && !strchr("elAFLRSXP", c)) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -2875,13 +3763,31 @@ main(int argc, char **argv) fatal("can't open '%s': %s", target, strerror(ENOMEM)); } - if ((error = spa_import(name, cfg, NULL)) != 0) - error = spa_import_verbatim(name, cfg, NULL); + if ((error = spa_import(name, cfg, NULL, + ZFS_IMPORT_MISSING_LOG)) != 0) { + error = spa_import(name, cfg, NULL, + ZFS_IMPORT_VERBATIM); + } } } + if (strpbrk(target, "/@") != NULL) { + size_t targetlen; + + target_is_spa = B_FALSE; + /* + * Remove any trailing slash. Later code would get confused + * by it, but we want to allow it so that "pool/" can + * indicate that we want to dump the topmost filesystem, + * rather than the whole pool. + */ + targetlen = strlen(target); + if (targetlen != 0 && target[targetlen - 1] == '/') + target[targetlen - 1] = '\0'; + } + if (error == 0) { - if (strpbrk(target, "/@") == NULL || dump_opt['R']) { + if (target_is_spa || dump_opt['R']) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { @@ -2927,7 +3833,13 @@ main(int argc, char **argv) argv[i], strerror(errno)); } } - (os != NULL) ? dump_dir(os) : dump_zpool(spa); + if (os != NULL) { + dump_dir(os); + } else if (zopt_objects > 0 && !dump_opt['m']) { + dump_dir(spa->spa_meta_objset); + } else { + dump_zpool(spa); + } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; @@ -2945,6 +3857,9 @@ main(int argc, char **argv) (os != NULL) ? 
dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG); fuid_table_destroy(); + sa_loaded = B_FALSE; + + dump_debug_buffer(); libzfs_fini(g_zfs); kernel_fini(); Index: src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c,v retrieving revision 1.2 diff -u -p -r1.2 zdb_il.c --- src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c 28 Mar 2014 03:46:56 -0000 1.2 +++ src/external/cddl/osnet/dist/cmd/zdb/zdb_il.c 17 Jul 2014 16:19:55 -0000 @@ -24,6 +24,10 @@ */ /* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +/* * Print intent log header and statistics. */ @@ -47,7 +51,7 @@ print_log_bp(const blkptr_t *bp, const c { char blkbuf[BP_SPRINTF_LEN]; - snprintf_blkptr(blkbuf, sizeof(blkbuf), bp); + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s%s\n", prefix, blkbuf); } @@ -118,7 +122,7 @@ zil_prt_rec_write(zilog_t *zilog, int tx { char *data, *dlimit; blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; + zbookmark_phys_t zb; char buf[SPA_MAXBLOCKSIZE]; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -132,6 +136,7 @@ zil_prt_rec_write(zilog_t *zilog, int tx if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", prefix, + !BP_IS_HOLE(bp) && bp->blk_birth >= spa_first_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, prefix); @@ -139,8 +144,6 @@ zil_prt_rec_write(zilog_t *zilog, int tx if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); - } - if (bp->blk_birth == 0) { bzero(buf, sizeof (buf)); (void) printf("%s\n", prefix); return; @@ -314,7 +317,7 @@ print_log_block(zilog_t *zilog, blkptr_t if (verbose >= 5) { (void) strcpy(blkbuf, ", "); snprintf_blkptr(blkbuf + strlen(blkbuf), - sizeof(blkbuf) - strlen(blkbuf), bp); + sizeof (blkbuf) - strlen(blkbuf), bp); } else { blkbuf[0] = '\0'; } @@ -362,7 +365,7 @@ dump_intent_log(zilog_t *zilog) int verbose = MAX(dump_opt['d'], dump_opt['i']); int i; - if (zh->zh_log.blk_birth == 0 || verbose < 1) + if (BP_IS_HOLE(&zh->zh_log) || verbose < 1) return; (void) printf("\n ZIL header: claim_txg %llu, " Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_iter.c --- src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c 27 Feb 2010 22:29:21 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c 10 Oct 2016 11:14:24 -0000 @@ -18,9 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #include @@ -69,7 +72,7 @@ uu_avl_pool_t *avl_pool; * Include snaps if they were requested or if this a zfs list where types * were not specified and the "listsnapshots" property is set on this pool. 
*/ -static int +static boolean_t zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb) { zpool_handle_t *zph; @@ -89,8 +92,9 @@ static int zfs_callback(zfs_handle_t *zhp, void *data) { callback_data_t *cb = data; - int dontclose = 0; - int include_snaps = zfs_include_snapshots(zhp, cb); + boolean_t should_close = B_TRUE; + boolean_t include_snaps = zfs_include_snapshots(zhp, cb); + boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK); if ((zfs_get_type(zhp) & cb->cb_types) || ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) { @@ -108,14 +112,15 @@ zfs_callback(zfs_handle_t *zhp, void *da cb->cb_props_table); if (zfs_expand_proplist(zhp, cb->cb_proplist, - (cb->cb_flags & ZFS_ITER_RECVD_PROPS)) + (cb->cb_flags & ZFS_ITER_RECVD_PROPS), + (cb->cb_flags & ZFS_ITER_LITERAL_PROPS)) != 0) { free(node); return (-1); } } uu_avl_insert(cb->cb_avl, node, idx); - dontclose = 1; + should_close = B_FALSE; } else { free(node); } @@ -130,12 +135,18 @@ zfs_callback(zfs_handle_t *zhp, void *da cb->cb_depth++; if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) (void) zfs_iter_filesystems(zhp, zfs_callback, data); - if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps) - (void) zfs_iter_snapshots(zhp, zfs_callback, data); + if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | + ZFS_TYPE_BOOKMARK)) == 0) && include_snaps) + (void) zfs_iter_snapshots(zhp, + (cb->cb_flags & ZFS_ITER_SIMPLE) != 0, zfs_callback, + data); + if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | + ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks) + (void) zfs_iter_bookmarks(zhp, zfs_callback, data); cb->cb_depth--; } - if (!dontclose) + if (should_close) zfs_close(zhp); return (0); @@ -185,6 +196,14 @@ zfs_free_sort_columns(zfs_sort_column_t } } +boolean_t +zfs_sort_only_by_name(const zfs_sort_column_t *sc) +{ + + return (sc != NULL && sc->sc_next == NULL && + sc->sc_prop == ZFS_PROP_NAME); +} + /* ARGSUSED */ static int zfs_compare(const void *larg, const void *rarg, void *unused) @@ -225,7 +244,13 @@ zfs_compare(const void *larg, const void lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - if (lcreate < rcreate) + /* + * Both lcreate and rcreate being 0 means we don't have + * properties and we should compare full name. 
+ */ + if (lcreate == 0 && rcreate == 0) + ret = strcmp(lat + 1, rat + 1); + else if (lcreate < rcreate) ret = -1; else if (lcreate > rcreate) ret = 1; @@ -291,7 +316,14 @@ zfs_sort(const void *larg, const void *r if (rvalid) verify(nvlist_lookup_string(rval, ZPROP_VALUE, &rstr) == 0); + } else if (psc->sc_prop == ZFS_PROP_NAME) { + lvalid = rvalid = B_TRUE; + + (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf)); + (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf)); + lstr = lbuf; + rstr = rbuf; } else if (zfs_prop_is_string(psc->sc_prop)) { lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf, sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0); @@ -351,11 +383,8 @@ zfs_for_each(int argc, char **argv, int avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT); - if (avl_pool == NULL) { - (void) fprintf(stderr, - gettext("internal error: out of memory\n")); - exit(1); - } + if (avl_pool == NULL) + nomem(); cb.cb_sortcol = sortcol; cb.cb_flags = flags; @@ -400,11 +429,8 @@ zfs_for_each(int argc, char **argv, int sizeof (cb.cb_props_table)); } - if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) { - (void) fprintf(stderr, - gettext("internal error: out of memory\n")); - exit(1); - } + if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); if (argc == 0) { /* @@ -454,11 +480,8 @@ zfs_for_each(int argc, char **argv, int /* * Finally, clean up the AVL tree. */ - if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) { - (void) fprintf(stderr, - gettext("internal error: out of memory")); - exit(1); - } + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(cb.cb_avl, node); Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_iter.h --- src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h 27 Feb 2010 22:29:20 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h 10 Oct 2016 11:14:24 -0000 @@ -18,9 +18,12 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
*/ #ifndef ZFS_ITER_H @@ -43,11 +46,14 @@ typedef struct zfs_sort_column { #define ZFS_ITER_PROP_LISTSNAPS (1 << 2) #define ZFS_ITER_DEPTH_LIMIT (1 << 3) #define ZFS_ITER_RECVD_PROPS (1 << 4) +#define ZFS_ITER_SIMPLE (1 << 5) +#define ZFS_ITER_LITERAL_PROPS (1 << 6) int zfs_for_each(int, char **, int options, zfs_type_t, zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); void zfs_free_sort_columns(zfs_sort_column_t *); +boolean_t zfs_sort_only_by_name(const zfs_sort_column_t *); #ifdef __cplusplus } Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c,v retrieving revision 1.5 diff -u -p -r1.5 zfs_main.c --- src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c 10 Apr 2015 22:28:27 -0000 1.5 +++ src/external/cddl/osnet/dist/cmd/zfs/zfs_main.c 22 Apr 2017 16:42:58 -0000 @@ -20,8 +20,16 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright 2012 Milan Jurik. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright (c) 2012 Martin Matuska . All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Igor Kozhukhov . + * Copyright 2016 Nexenta Systems, Inc. */ #include @@ -41,24 +49,43 @@ #include #include #include -#include +#include +#include #include #include #include #include #include +#include +#include +#include +#ifdef __FreeBSD__ +#include +#endif +#ifdef __NetBSD__ +#include +#endif #include +#include +#include +#include #include +#ifdef illumos +#include +#include +#include +#endif #include "zfs_iter.h" #include "zfs_util.h" +#include "zfs_comutil.h" libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; -const char *pypath = "/usr/lib/zfs/pyzfs.py"; +static boolean_t log_history = B_TRUE; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); @@ -79,9 +106,17 @@ static int zfs_do_send(int argc, char ** static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); static int zfs_do_userspace(int argc, char **argv); -static int zfs_do_python(int argc, char **argv); +static int zfs_do_allow(int argc, char **argv); +static int zfs_do_unallow(int argc, char **argv); static int zfs_do_hold(int argc, char **argv); +static int zfs_do_holds(int argc, char **argv); static int zfs_do_release(int argc, char **argv); +static int zfs_do_diff(int argc, char **argv); +#ifdef __FreeBSD__ +static int zfs_do_jail(int argc, char **argv); +static int zfs_do_unjail(int argc, char **argv); +#endif +static int zfs_do_bookmark(int argc, char **argv); /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. 
@@ -108,6 +143,8 @@ typedef enum { HELP_GET, HELP_INHERIT, HELP_UPGRADE, + HELP_JAIL, + HELP_UNJAIL, HELP_LIST, HELP_MOUNT, HELP_PROMOTE, @@ -126,7 +163,9 @@ typedef enum { HELP_GROUPSPACE, HELP_HOLD, HELP_HOLDS, - HELP_RELEASE + HELP_RELEASE, + HELP_DIFF, + HELP_BOOKMARK, } zfs_help_t; typedef struct zfs_command { @@ -153,6 +192,7 @@ static zfs_command_t command_table[] = { { "clone", zfs_do_clone, HELP_CLONE }, { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, + { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, @@ -171,13 +211,19 @@ static zfs_command_t command_table[] = { { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, - { "allow", zfs_do_python, HELP_ALLOW }, + { "allow", zfs_do_allow, HELP_ALLOW }, { NULL }, - { "unallow", zfs_do_python, HELP_UNALLOW }, + { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, - { "holds", zfs_do_python, HELP_HOLDS }, + { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, + { "diff", zfs_do_diff, HELP_DIFF }, +#ifdef __FreeBSD__ + { NULL }, + { "jail", zfs_do_jail, HELP_JAIL }, + { "unjail", zfs_do_unjail, HELP_UNJAIL }, +#endif }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -192,16 +238,19 @@ get_usage(zfs_help_t idx) return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: - return (gettext("\tcreate [-p] [-o property=value] ... " + return (gettext("\tcreate [-pu] [-o property=value] ... " "\n" "\tcreate [-ps] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: - return (gettext("\tdestroy [-rRf] \n" - "\tdestroy [-rRd] \n")); + return (gettext("\tdestroy [-fnpRrv] \n" + "\tdestroy [-dnpRrv] " + "@[%][,...]\n" + "\tdestroy #\n")); case HELP_GET: return (gettext("\tget [-rHp] [-d max] " - "[-o \"all\" | field[,...]] [-s source[,...]]\n" + "[-o \"all\" | field[,...]]\n" + "\t [-t type[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot] ...\n")); case HELP_INHERIT: @@ -210,10 +259,13 @@ get_usage(zfs_help_t idx) case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); + case HELP_JAIL: + return (gettext("\tjail \n")); + case HELP_UNJAIL: + return (gettext("\tunjail \n")); case HELP_LIST: - return (gettext("\tlist [-rH][-d max] " - "[-o property[,...]] [-t type[,...]] [-s property] ...\n" - "\t [-S property] ... " + return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " + "[-s property]...\n\t [-S property]... 
[-t type[,...]] " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: return (gettext("\tmount\n" @@ -221,28 +273,35 @@ get_usage(zfs_help_t idx) case HELP_PROMOTE: return (gettext("\tpromote \n")); case HELP_RECEIVE: - return (gettext("\treceive [-vnF] \n" - "\treceive [-vnF] -d \n")); + return (gettext("\treceive|recv [-vnsFu] \n" + "\treceive|recv [-vnsFu] [-o origin=] [-d | -e] " + "\n" + "\treceive|recv -A \n")); case HELP_RENAME: - return (gettext("\trename " + return (gettext("\trename [-f] " "\n" - "\trename -p \n" - "\trename -r ")); + "\trename [-f] -p \n" + "\trename -r \n" + "\trename -u [-p] ")); case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: - return (gettext("\tsend [-RDp] [-[iI] snapshot] \n")); + return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] " + "\n" + "\tsend [-Le] [-i snapshot|bookmark] " + "\n" + "\tsend [-nvPe] -t \n")); case HELP_SET: - return (gettext("\tset " + return (gettext("\tset ... " " ...\n")); case HELP_SHARE: return (gettext("\tshare <-a | filesystem>\n")); case HELP_SNAPSHOT: - return (gettext("\tsnapshot [-r] [-o property=value] ... " - "\n")); + return (gettext("\tsnapshot|snap [-r] [-o property=value] ... " + "@ ...\n")); case HELP_UNMOUNT: - return (gettext("\tunmount [-f] " + return (gettext("\tunmount|umount [-f] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: return (gettext("\tunshare " @@ -268,41 +327,66 @@ get_usage(zfs_help_t idx) "\tunallow [-r] -s @setname [[,...]] " "\n")); case HELP_USERSPACE: - return (gettext("\tuserspace [-hniHp] [-o field[,...]] " - "[-sS field] ... [-t type[,...]]\n" - "\t \n")); + return (gettext("\tuserspace [-Hinp] [-o field[,...]] " + "[-s field] ...\n" + "\t [-S field] ... [-t type[,...]] " + "\n")); case HELP_GROUPSPACE: - return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] " - "[-sS field] ... [-t type[,...]]\n" - "\t \n")); + return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " + "[-s field] ...\n" + "\t [-S field] ... [-t type[,...]] " + "\n")); case HELP_HOLD: return (gettext("\thold [-r] ...\n")); case HELP_HOLDS: - return (gettext("\tholds [-r] ...\n")); + return (gettext("\tholds [-Hp] [-r|-d depth] " + " ...\n")); case HELP_RELEASE: return (gettext("\trelease [-r] ...\n")); + case HELP_DIFF: + return (gettext("\tdiff [-FHt] " + "[snapshot|filesystem]\n")); + case HELP_BOOKMARK: + return (gettext("\tbookmark \n")); } abort(); /* NOTREACHED */ } +void +nomem(void) +{ + (void) fprintf(stderr, gettext("internal error: out of memory\n")); + exit(1); +} + /* * Utility function to guarantee malloc() success. */ + void * safe_malloc(size_t size) { void *data; - if ((data = calloc(1, size)) == NULL) { - (void) fprintf(stderr, "internal error: out of memory\n"); - exit(1); - } + if ((data = calloc(1, size)) == NULL) + nomem(); return (data); } +static char * +safe_strdup(char *str) +{ + char *dupstr = strdup(str); + + if (dupstr == NULL) + nomem(); + + return (dupstr); +} + /* * Callback routine that will print out information for each of * the properties. @@ -391,6 +475,8 @@ usage(boolean_t requested) (void) fprintf(fp, "YES NO | none\n"); (void) fprintf(fp, "\t%-15s ", "groupquota@..."); (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "written@"); + (void) fprintf(fp, " NO NO \n"); (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); @@ -423,15 +509,18 @@ usage(boolean_t requested) exit(requested ? 
0 : 2); } +/* + * Take a property=value argument string and add it to the given nvlist. + * Modifies the argument inplace. + */ static int -parseprop(nvlist_t *props) +parseprop(nvlist_t *props, char *propname) { - char *propname = optarg; char *propval, *strval; if ((propval = strchr(propname, '=')) == NULL) { (void) fprintf(stderr, gettext("missing " - "'=' for -o option\n")); + "'=' for property=value argument\n")); return (-1); } *propval = '\0'; @@ -441,11 +530,8 @@ parseprop(nvlist_t *props) "specified multiple times\n"), propname); return (-1); } - if (nvlist_add_string(props, propname, propval) != 0) { - (void) fprintf(stderr, gettext("internal " - "error: out of memory\n")); - return (-1); - } + if (nvlist_add_string(props, propname, propval) != 0) + nomem(); return (0); } @@ -458,7 +544,7 @@ parse_depth(char *opt, int *flags) depth = (int)strtol(opt, &tmp, 0); if (*tmp) { (void) fprintf(stderr, - gettext("%s is not an integer\n"), optarg); + gettext("%s is not an integer\n"), opt); usage(B_FALSE); } if (depth < 0) { @@ -470,6 +556,71 @@ parse_depth(char *opt, int *flags) return (depth); } +#define PROGRESS_DELAY 2 /* seconds */ + +static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; +static time_t pt_begin; +static char *pt_header = NULL; +static boolean_t pt_shown; + +static void +start_progress_timer(void) +{ + pt_begin = time(NULL) + PROGRESS_DELAY; + pt_shown = B_FALSE; +} + +static void +set_progress_header(char *header) +{ + assert(pt_header == NULL); + pt_header = safe_strdup(header); + if (pt_shown) { + (void) printf("%s: ", header); + (void) fflush(stdout); + } +} + +static void +update_progress(char *update) +{ + if (!pt_shown && time(NULL) > pt_begin) { + int len = strlen(update); + + (void) printf("%s: %s%*.*s", pt_header, update, len, len, + pt_reverse); + (void) fflush(stdout); + pt_shown = B_TRUE; + } else if (pt_shown) { + int len = strlen(update); + + (void) printf("%s%*.*s", update, len, len, pt_reverse); + (void) fflush(stdout); + } +} + +static void +finish_progress(char *done) +{ + if (pt_shown) { + (void) printf("%s\n", done); + (void) fflush(stdout); + } + free(pt_header); + pt_header = NULL; +} + +/* + * Check if the dataset is mountable and should be automatically mounted. + */ +static boolean_t +should_auto_mount(zfs_handle_t *zhp) +{ + if (!zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, zfs_get_type(zhp))) + return (B_FALSE); + return (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON); +} + /* * zfs clone [-p] [-o prop=value] ... * @@ -486,20 +637,17 @@ zfs_do_clone(int argc, char **argv) zfs_handle_t *zhp = NULL; boolean_t parents = B_FALSE; nvlist_t *props; - int ret; + int ret = 0; int c; - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { - (void) fprintf(stderr, gettext("internal error: " - "out of memory\n")); - return (1); - } + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); /* check options */ while ((c = getopt(argc, argv, "o:p")) != -1) { switch (c) { case 'o': - if (parseprop(props)) + if (parseprop(props, optarg) != 0) return (1); break; case 'p': @@ -558,8 +706,22 @@ zfs_do_clone(int argc, char **argv) clone = zfs_open(g_zfs, argv[1], ZFS_TYPE_DATASET); if (clone != NULL) { - if ((ret = zfs_mount(clone, NULL, 0)) == 0) - ret = zfs_share(clone); + /* + * If the user doesn't want the dataset + * automatically mounted, then skip the mount/share + * step. 
+ */ + if (should_auto_mount(clone)) { + if ((ret = zfs_mount(clone, NULL, 0)) != 0) { + (void) fprintf(stderr, gettext("clone " + "successfully created, " + "but not mounted\n")); + } else if ((ret = zfs_share(clone)) != 0) { + (void) fprintf(stderr, gettext("clone " + "successfully created, " + "but not shared\n")); + } + } zfs_close(clone); } } @@ -578,7 +740,7 @@ usage: } /* - * zfs create [-p] [-o prop=value] ... fs + * zfs create [-pu] [-o prop=value] ... fs * zfs create [-ps] [-b blocksize] [-o prop=value] ... -V vol size * * Create a new dataset. This command can be used to create filesystems @@ -591,30 +753,29 @@ usage: * SPA_VERSION_REFRESERVATION, we set a refreservation instead. * * The '-p' flag creates all the non-existing ancestors of the target first. + * + * The '-u' flag prevents mounting of newly created file system. */ static int zfs_do_create(int argc, char **argv) { zfs_type_t type = ZFS_TYPE_FILESYSTEM; zfs_handle_t *zhp = NULL; - uint64_t volsize; + uint64_t volsize = 0; int c; boolean_t noreserve = B_FALSE; boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; + boolean_t nomount = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; - int canmount; - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { - (void) fprintf(stderr, gettext("internal error: " - "out of memory\n")); - return (1); - } + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); /* check options */ - while ((c = getopt(argc, argv, ":V:b:so:p")) != -1) { + while ((c = getopt(argc, argv, ":V:b:so:pu")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; @@ -626,12 +787,8 @@ zfs_do_create(int argc, char **argv) } if (nvlist_add_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), - intval) != 0) { - (void) fprintf(stderr, gettext("internal " - "error: out of memory\n")); - goto error; - } + zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) + nomem(); volsize = intval; break; case 'p': @@ -648,24 +805,23 @@ zfs_do_create(int argc, char **argv) if (nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - intval) != 0) { - (void) fprintf(stderr, gettext("internal " - "error: out of memory\n")); - goto error; - } + intval) != 0) + nomem(); break; case 'o': - if (parseprop(props)) + if (parseprop(props, optarg) != 0) goto error; break; case 's': noreserve = B_TRUE; break; + case 'u': + nomount = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing size " "argument\n")); goto badusage; - break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -678,6 +834,11 @@ zfs_do_create(int argc, char **argv) "used when creating a volume\n")); goto badusage; } + if (nomount && type != ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("'-u' can only be " + "used when creating a file system\n")); + goto badusage; + } argc -= optind; argv += optind; @@ -695,12 +856,14 @@ zfs_do_create(int argc, char **argv) if (type == ZFS_TYPE_VOLUME && !noreserve) { zpool_handle_t *zpool_handle; + nvlist_t *real_props = NULL; uint64_t spa_version; char *p; zfs_prop_t resv_prop; char *strval; + char msg[1024]; - if (p = strchr(argv[0], '/')) + if ((p = strchr(argv[0], '/')) != NULL) *p = '\0'; zpool_handle = zpool_open(g_zfs, argv[0]); if (p != NULL) @@ -709,21 +872,29 @@ zfs_do_create(int argc, char **argv) goto error; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); - zpool_close(zpool_handle); if (spa_version >= SPA_VERSION_REFRESERVATION) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; - volsize = 
zvol_volsize_to_reservation(volsize, props); + + (void) snprintf(msg, sizeof (msg), + gettext("cannot create '%s'"), argv[0]); + if (props && (real_props = zfs_valid_proplist(g_zfs, type, + props, 0, NULL, zpool_handle, msg)) == NULL) { + zpool_close(zpool_handle); + goto error; + } + zpool_close(zpool_handle); + + volsize = zvol_volsize_to_reservation(volsize, real_props); + nvlist_free(real_props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { if (nvlist_add_uint64(props, zfs_prop_to_name(resv_prop), volsize) != 0) { - (void) fprintf(stderr, gettext("internal " - "error: out of memory\n")); nvlist_free(props); - return (1); + nomem(); } } } @@ -748,20 +919,17 @@ zfs_do_create(int argc, char **argv) if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) goto error; - /* - * if the user doesn't want the dataset automatically mounted, - * then skip the mount/share step - */ - canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); + ret = 0; /* * Mount and/or share the new filesystem as appropriate. We provide a * verbose error message to let the user know that their filesystem was * in fact created, even if we failed to mount or share it. + * If the user doesn't want the dataset automatically mounted, + * then skip the mount/share step altogether. */ - ret = 0; - if (canmount == ZFS_CANMOUNT_ON) { + if (!nomount && should_auto_mount(zhp)) { if (zfs_mount(zhp, NULL, 0) != 0) { (void) fprintf(stderr, gettext("filesystem " "successfully created, but not mounted\n")); @@ -799,15 +967,25 @@ badusage: */ typedef struct destroy_cbdata { boolean_t cb_first; - int cb_force; - int cb_recurse; - int cb_error; - int cb_needforce; - int cb_doclones; - boolean_t cb_closezhp; + boolean_t cb_force; + boolean_t cb_recurse; + boolean_t cb_error; + boolean_t cb_doclones; zfs_handle_t *cb_target; - char *cb_snapname; boolean_t cb_defer_destroy; + boolean_t cb_verbose; + boolean_t cb_parsable; + boolean_t cb_dryrun; + nvlist_t *cb_nvl; + nvlist_t *cb_batchedsnaps; + + /* first snap in contiguous run */ + char *cb_firstsnap; + /* previous snap in contiguous run */ + char *cb_prevsnap; + int64_t cb_snapused; + char *cb_snapspec; + char *cb_bookmark; } destroy_cbdata_t; /* @@ -837,7 +1015,7 @@ destroy_check_dependent(zfs_handle_t *zh (void) fprintf(stderr, gettext("use '-r' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; - cbp->cb_error = 1; + cbp->cb_error = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); @@ -858,7 +1036,8 @@ destroy_check_dependent(zfs_handle_t *zh (void) fprintf(stderr, gettext("use '-R' to destroy " "the following datasets:\n")); cbp->cb_first = B_FALSE; - cbp->cb_error = 1; + cbp->cb_error = B_TRUE; + cbp->cb_dryrun = B_TRUE; } (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); @@ -872,7 +1051,20 @@ out: static int destroy_callback(zfs_handle_t *zhp, void *data) { - destroy_cbdata_t *cbp = data; + destroy_cbdata_t *cb = data; + const char *name = zfs_get_name(zhp); + + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", name); + } else if (cb->cb_dryrun) { + (void) printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } /* * Ignore pools (which we've already flagged as an error before getting @@ -883,14 +1075,31 @@ destroy_callback(zfs_handle_t *zhp, void zfs_close(zhp); return (0); } + if (cb->cb_dryrun) { + zfs_close(zhp); + return (0); + } /* - * Bail out on the first error. 
+ * We batch up all contiguous snapshots (even of different + * filesystems) and destroy them with one ioctl. We can't + * simply do all snap deletions and then all fs deletions, + * because we must delete a clone before its origin. */ - if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 || - zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) { - zfs_close(zhp); - return (-1); + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { + fnvlist_add_boolean(cb->cb_batchedsnaps, name); + } else { + int error = zfs_destroy_snaps_nvl(g_zfs, + cb->cb_batchedsnaps, B_FALSE); + fnvlist_free(cb->cb_batchedsnaps); + cb->cb_batchedsnaps = fnvlist_alloc(); + + if (error != 0 || + zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || + zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { + zfs_close(zhp); + return (-1); + } } zfs_close(zhp); @@ -898,66 +1107,183 @@ destroy_callback(zfs_handle_t *zhp, void } static int -destroy_snap_clones(zfs_handle_t *zhp, void *arg) +destroy_print_cb(zfs_handle_t *zhp, void *arg) { - destroy_cbdata_t *cbp = arg; - char thissnap[MAXPATHLEN]; - zfs_handle_t *szhp; - boolean_t closezhp = cbp->cb_closezhp; - int rv; + destroy_cbdata_t *cb = arg; + const char *name = zfs_get_name(zhp); + int err = 0; - (void) snprintf(thissnap, sizeof (thissnap), - "%s@%s", zfs_get_name(zhp), cbp->cb_snapname); + if (nvlist_exists(cb->cb_nvl, name)) { + if (cb->cb_firstsnap == NULL) + cb->cb_firstsnap = strdup(name); + if (cb->cb_prevsnap != NULL) + free(cb->cb_prevsnap); + /* this snap continues the current range */ + cb->cb_prevsnap = strdup(name); + if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) + nomem(); + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", name); + } else if (cb->cb_dryrun) { + (void) printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } + } else if (cb->cb_firstsnap != NULL) { + /* end of this range */ + uint64_t used = 0; + err = lzc_snaprange_space(cb->cb_firstsnap, + cb->cb_prevsnap, &used); + cb->cb_snapused += used; + free(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + free(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; + } + zfs_close(zhp); + return (err); +} - libzfs_print_on_error(g_zfs, B_FALSE); - szhp = zfs_open(g_zfs, thissnap, ZFS_TYPE_SNAPSHOT); - libzfs_print_on_error(g_zfs, B_TRUE); - if (szhp) { - /* - * Destroy any clones of this snapshot - */ - if (zfs_iter_dependents(szhp, B_FALSE, destroy_callback, - cbp) != 0) { - zfs_close(szhp); - if (closezhp) - zfs_close(zhp); - return (-1); +static int +destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) +{ + int err = 0; + assert(cb->cb_firstsnap == NULL); + assert(cb->cb_prevsnap == NULL); + err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb); + if (cb->cb_firstsnap != NULL) { + uint64_t used = 0; + if (err == 0) { + err = lzc_snaprange_space(cb->cb_firstsnap, + cb->cb_prevsnap, &used); } - zfs_close(szhp); + cb->cb_snapused += used; + free(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + free(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; } + return (err); +} - cbp->cb_closezhp = B_TRUE; - rv = zfs_iter_filesystems(zhp, destroy_snap_clones, arg); - if (closezhp) - zfs_close(zhp); - return (rv); +static int +snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + /* Check for clones. 
*/ + if (!cb->cb_doclones && !cb->cb_defer_destroy) { + cb->cb_target = zhp; + cb->cb_first = B_TRUE; + err = zfs_iter_dependents(zhp, B_TRUE, + destroy_check_dependent, cb); + } + + if (err == 0) { + if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) + nomem(); + } + zfs_close(zhp); + return (err); +} + +static int +gather_snapshots(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); + if (err == ENOENT) + err = 0; + if (err != 0) + goto out; + + if (cb->cb_verbose) { + err = destroy_print_snapshots(zhp, cb); + if (err != 0) + goto out; + } + + if (cb->cb_recurse) + err = zfs_iter_filesystems(zhp, gather_snapshots, cb); + +out: + zfs_close(zhp); + return (err); +} + +static int +destroy_clones(destroy_cbdata_t *cb) +{ + nvpair_t *pair; + for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); + pair != NULL; + pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { + zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), + ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + boolean_t defer = cb->cb_defer_destroy; + int err = 0; + + /* + * We can't defer destroy non-snapshots, so set it to + * false while destroying the clones. + */ + cb->cb_defer_destroy = B_FALSE; + err = zfs_iter_dependents(zhp, B_FALSE, + destroy_callback, cb); + cb->cb_defer_destroy = defer; + zfs_close(zhp); + if (err != 0) + return (err); + } + } + return (0); } static int zfs_do_destroy(int argc, char **argv) { destroy_cbdata_t cb = { 0 }; + int rv = 0; + int err = 0; int c; - zfs_handle_t *zhp; - char *cp; + zfs_handle_t *zhp = NULL; + char *at, *pound; zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, "dfrR")) != -1) { + while ((c = getopt(argc, argv, "vpndfrR")) != -1) { switch (c) { + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'p': + cb.cb_verbose = B_TRUE; + cb.cb_parsable = B_TRUE; + break; + case 'n': + cb.cb_dryrun = B_TRUE; + break; case 'd': cb.cb_defer_destroy = B_TRUE; type = ZFS_TYPE_SNAPSHOT; break; case 'f': - cb.cb_force = 1; + cb.cb_force = B_TRUE; break; case 'r': - cb.cb_recurse = 1; + cb.cb_recurse = B_TRUE; break; case 'R': - cb.cb_recurse = 1; - cb.cb_doclones = 1; + cb.cb_recurse = B_TRUE; + cb.cb_doclones = B_TRUE; break; case '?': default: @@ -972,7 +1298,7 @@ zfs_do_destroy(int argc, char **argv) /* check number of arguments */ if (argc == 0) { - (void) fprintf(stderr, gettext("missing path argument\n")); + (void) fprintf(stderr, gettext("missing dataset argument\n")); usage(B_FALSE); } if (argc > 1) { @@ -980,112 +1306,195 @@ zfs_do_destroy(int argc, char **argv) usage(B_FALSE); } - /* - * If we are doing recursive destroy of a snapshot, then the - * named snapshot may not exist. Go straight to libzfs. - */ - if (cb.cb_recurse && (cp = strchr(argv[0], '@'))) { - int ret; + at = strchr(argv[0], '@'); + pound = strchr(argv[0], '#'); + if (at != NULL) { + + /* Build the list of snaps to destroy in cb_nvl. 
*/ + cb.cb_nvl = fnvlist_alloc(); - *cp = '\0'; - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) + *at = '\0'; + zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) return (1); - *cp = '@'; - cp++; - if (cb.cb_doclones) { - boolean_t defer = cb.cb_defer_destroy; + cb.cb_snapspec = at + 1; + if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || + cb.cb_error) { + rv = 1; + goto out; + } + + if (nvlist_empty(cb.cb_nvl)) { + (void) fprintf(stderr, gettext("could not find any " + "snapshots to destroy; check snapshot names.\n")); + rv = 1; + goto out; + } - /* - * Temporarily ignore the defer_destroy setting since - * it's not supported for clones. - */ - cb.cb_defer_destroy = B_FALSE; - cb.cb_snapname = cp; - if (destroy_snap_clones(zhp, &cb) != 0) { - zfs_close(zhp); - return (1); + if (cb.cb_verbose) { + char buf[16]; + zfs_nicenum(cb.cb_snapused, buf, sizeof (buf)); + if (cb.cb_parsable) { + (void) printf("reclaim\t%llu\n", + cb.cb_snapused); + } else if (cb.cb_dryrun) { + (void) printf(gettext("would reclaim %s\n"), + buf); + } else { + (void) printf(gettext("will reclaim %s\n"), + buf); } - cb.cb_defer_destroy = defer; } - ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy); - zfs_close(zhp); - if (ret) { + if (!cb.cb_dryrun) { + if (cb.cb_doclones) { + cb.cb_batchedsnaps = fnvlist_alloc(); + err = destroy_clones(&cb); + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, + cb.cb_batchedsnaps, B_FALSE); + } + if (err != 0) { + rv = 1; + goto out; + } + } + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, + cb.cb_defer_destroy); + } + } + + if (err != 0) + rv = 1; + } else if (pound != NULL) { + int err; + nvlist_t *nvl; + + if (cb.cb_dryrun) { (void) fprintf(stderr, - gettext("no snapshots destroyed\n")); + "dryrun is not supported with bookmark\n"); + return (-1); } - return (ret != 0); - } - /* Open the given dataset */ - if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) - return (1); + if (cb.cb_defer_destroy) { + (void) fprintf(stderr, + "defer destroy is not supported with bookmark\n"); + return (-1); + } - cb.cb_target = zhp; + if (cb.cb_recurse) { + (void) fprintf(stderr, + "recursive is not supported with bookmark\n"); + return (-1); + } - /* - * Perform an explicit check for pools before going any further. - */ - if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && - zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { - (void) fprintf(stderr, gettext("cannot destroy '%s': " - "operation does not apply to pools\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use 'zfs destroy -r " - "%s' to destroy all datasets in the pool\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use 'zpool destroy %s' " - "to destroy the pool itself\n"), zfs_get_name(zhp)); - zfs_close(zhp); - return (1); - } + if (!zfs_bookmark_exists(argv[0])) { + (void) fprintf(stderr, gettext("bookmark '%s' " + "does not exist.\n"), argv[0]); + return (1); + } - /* - * Check for any dependents and/or clones. 
- */ - cb.cb_first = B_TRUE; - if (!cb.cb_doclones && !cb.cb_defer_destroy && - zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, - &cb) != 0) { - zfs_close(zhp); - return (1); - } + nvl = fnvlist_alloc(); + fnvlist_add_boolean(nvl, argv[0]); - if (cb.cb_error || (!cb.cb_defer_destroy && - (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) { - zfs_close(zhp); - return (1); - } + err = lzc_destroy_bookmarks(nvl, NULL); + if (err != 0) { + (void) zfs_standard_error(g_zfs, err, + "cannot destroy bookmark"); + } - /* - * Do the real thing. The callback will close the handle regardless of - * whether it succeeds or not. - */ + nvlist_free(cb.cb_nvl); - if (destroy_callback(zhp, &cb) != 0) - return (1); + return (err); + } else { + /* Open the given dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) + return (1); - return (0); -} + cb.cb_target = zhp; -static boolean_t -is_recvd_column(zprop_get_cbdata_t *cbp) -{ - int i; - zfs_get_column_t col; + /* + * Perform an explicit check for pools before going any further. + */ + if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "operation does not apply to pools\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zfs destroy -r " + "%s' to destroy all datasets in the pool\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zpool destroy %s' " + "to destroy the pool itself\n"), zfs_get_name(zhp)); + rv = 1; + goto out; + } - for (i = 0; i < ZFS_GET_NCOLS && - (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) - if (col == GET_COL_RECVD) - return (B_TRUE); - return (B_FALSE); -} + /* + * Check for any dependents and/or clones. + */ + cb.cb_first = B_TRUE; + if (!cb.cb_doclones && + zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, + &cb) != 0) { + rv = 1; + goto out; + } -/* - * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] - * < all | property[,property]... > < fs | snap | vol > ... - * + if (cb.cb_error) { + rv = 1; + goto out; + } + + cb.cb_batchedsnaps = fnvlist_alloc(); + if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, + &cb) != 0) { + rv = 1; + goto out; + } + + /* + * Do the real thing. The callback will close the + * handle regardless of whether it succeeds or not. + */ + err = destroy_callback(zhp, &cb); + zhp = NULL; + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, + cb.cb_batchedsnaps, cb.cb_defer_destroy); + } + if (err != 0) + rv = 1; + } + +out: + fnvlist_free(cb.cb_batchedsnaps); + fnvlist_free(cb.cb_nvl); + if (zhp != NULL) + zfs_close(zhp); + return (rv); +} + +static boolean_t +is_recvd_column(zprop_get_cbdata_t *cbp) +{ + int i; + zfs_get_column_t col; + + for (i = 0; i < ZFS_GET_NCOLS && + (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) + if (col == GET_COL_RECVD) + return (B_TRUE); + return (B_FALSE); +} + +/* + * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] + * < all | property[,property]... > < fs | snap | vol > ... + * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. 
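
[Editor's illustrative sketch, not part of the patch above: the reworked destroy path collects snapshot names into an nvlist (cb_nvl / cb_batchedsnaps) and destroys the whole batch with a single ioctl. A minimal standalone version of that pattern, assuming the lzc_destroy_snaps() entry point from libzfs_core and the fnvlist_*() helpers from libnvpair; the helper name and abbreviated error handling are this sketch's own.]

/*
 * Collect snapshot names into an nvlist and destroy them all in one
 * libzfs_core call, as the batched destroy path above does.
 */
#include <libnvpair.h>
#include <libzfs_core.h>

static int
destroy_snaps_batched(char *const *snapnames, int count, boolean_t defer)
{
	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int err, i;

	/* Each nvlist key is a full snapshot name, e.g. "pool/fs@snap". */
	for (i = 0; i < count; i++)
		fnvlist_add_boolean(snaps, snapnames[i]);

	/* One ioctl destroys every snapshot in the list. */
	err = lzc_destroy_snaps(snaps, defer, &errlist);

	fnvlist_free(snaps);
	if (errlist != NULL)
		fnvlist_free(errlist);
	return (err);
}
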
@@ -1110,7 +1519,7 @@ get_callback(zfs_handle_t *zhp, void *da char buf[ZFS_MAXPROPLEN]; char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; - char source[ZFS_MAXNAMELEN]; + char source[ZFS_MAX_DATASET_NAME_LEN]; zprop_get_cbdata_t *cbp = data; nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; @@ -1166,6 +1575,17 @@ get_callback(zfs_handle_t *zhp, void *da zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, buf, sourcetype, source, NULL); + } else if (zfs_prop_written(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_written(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); } else { if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { @@ -1210,9 +1630,10 @@ static int zfs_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; - int i, c, flags = 0; + int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; + int types = ZFS_TYPE_DATASET; char *value, *fields; - int ret; + int ret = 0; int limit = 0; zprop_list_t fake_name = { 0 }; @@ -1227,7 +1648,7 @@ zfs_do_get(int argc, char **argv) cb.cb_type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) { + while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; @@ -1302,7 +1723,7 @@ zfs_do_get(int argc, char **argv) default: (void) fprintf(stderr, gettext("invalid column name " - "'%s'\n"), value); + "'%s'\n"), suboptarg); usage(B_FALSE); } } @@ -1339,7 +1760,43 @@ zfs_do_get(int argc, char **argv) default: (void) fprintf(stderr, gettext("invalid source " - "'%s'\n"), value); + "'%s'\n"), suboptarg); + usage(B_FALSE); + } + } + break; + + case 't': + types = 0; + flags &= ~ZFS_ITER_PROP_LISTSNAPS; + while (*optarg != '\0') { + static char *type_subopts[] = { "filesystem", + "volume", "snapshot", "bookmark", + "all", NULL }; + + switch (getsubopt(&optarg, type_subopts, + &value)) { + case 0: + types |= ZFS_TYPE_FILESYSTEM; + break; + case 1: + types |= ZFS_TYPE_VOLUME; + break; + case 2: + types |= ZFS_TYPE_SNAPSHOT; + break; + case 3: + types |= ZFS_TYPE_BOOKMARK; + break; + case 4: + types = ZFS_TYPE_DATASET | + ZFS_TYPE_BOOKMARK; + break; + + default: + (void) fprintf(stderr, + gettext("invalid type '%s'\n"), + suboptarg); usage(B_FALSE); } } @@ -1388,7 +1845,7 @@ zfs_do_get(int argc, char **argv) cb.cb_first = B_TRUE; /* run for each object */ - ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, 0, + ret = zfs_for_each(argc, argv, flags, types, NULL, &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) @@ -1449,7 +1906,7 @@ zfs_do_inherit(int argc, char **argv) zfs_prop_t prop; inherit_cbdata_t cb = { 0 }; char *propname; - int ret; + int ret = 0; int flags = 0; boolean_t received = B_FALSE; @@ -1500,9 +1957,13 @@ zfs_do_inherit(int argc, char **argv) if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || prop == ZFS_PROP_REFQUOTA || - prop == ZFS_PROP_REFRESERVATION) + prop == ZFS_PROP_REFRESERVATION) { (void) fprintf(stderr, gettext("use 'zfs set " "%s=none' to clear\n"), propname); + (void) fprintf(stderr, gettext("use 'zfs " + "inherit -S %s' to revert to received " + "value\n"), propname); + } return (1); } if (received && (prop == ZFS_PROP_VOLSIZE || @@ -1538,7 +1999,7 @@ typedef struct upgrade_cbdata { uint64_t 
cb_version; boolean_t cb_newer; boolean_t cb_foundone; - char cb_lastfs[ZFS_MAXNAMELEN]; + char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; } upgrade_cbdata_t; static int @@ -1594,31 +2055,25 @@ upgrade_set_callback(zfs_handle_t *zhp, { upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - int i; - static struct { int zplver; int spaver; } table[] = { - {ZPL_VERSION_FUID, SPA_VERSION_FUID}, - {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, - {0, 0} - }; + int needed_spa_version; + int spa_version; + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); - for (i = 0; table[i].zplver; i++) { - if (cb->cb_version >= table[i].zplver) { - int spa_version; + needed_spa_version = zfs_spa_version_map(cb->cb_version); - if (zfs_spa_version(zhp, &spa_version) < 0) - return (-1); + if (needed_spa_version < 0) + return (-1); - if (spa_version < table[i].spaver) { - /* can't upgrade */ - (void) printf(gettext("%s: can not be " - "upgraded; the pool version needs to first " - "be upgraded\nto version %d\n\n"), - zfs_get_name(zhp), table[i].spaver); - cb->cb_numfailed++; - return (0); - } - } + if (spa_version < needed_spa_version) { + /* can't upgrade */ + (void) printf(gettext("%s: can not be " + "upgraded; the pool version needs to first " + "be upgraded\nto version %d\n\n"), + zfs_get_name(zhp), needed_spa_version); + cb->cb_numfailed++; + return (0); } /* upgrade */ @@ -1630,9 +2085,11 @@ upgrade_set_callback(zfs_handle_t *zhp, /* * If they did "zfs upgrade -a", then we could * be doing ioctls to different pools. We need - * to log this history once to each pool. + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). */ - verify(zpool_stage_history(g_zfs, history_str) == 0); + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; } if (zfs_prop_set(zhp, "version", verstr) == 0) cb->cb_numupgraded++; @@ -1661,9 +2118,9 @@ zfs_do_upgrade(int argc, char **argv) { boolean_t all = B_FALSE; boolean_t showversions = B_FALSE; - int ret; + int ret = 0; upgrade_cbdata_t cb = { 0 }; - char c; + int c; int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ @@ -1716,15 +2173,14 @@ zfs_do_upgrade(int argc, char **argv) "---------------\n"); (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); - (void) printf(gettext(" 3 Case insensitive and File system " - "unique identifier (FUID)\n")); + (void) printf(gettext(" 3 Case insensitive and filesystem " + "user identifier (FUID)\n")); (void) printf(gettext(" 4 userquota, groupquota " "properties\n")); + (void) printf(gettext(" 5 System attributes\n")); (void) printf(gettext("\nFor more information on a particular " - "version, including supported releases, see:\n\n")); - (void) printf("http://www.opensolaris.org/os/community/zfs/" - "version/zpl/N\n\n"); - (void) printf(gettext("Where 'N' is the version number.\n")); + "version, including supported releases,\n")); + (void) printf("see the ZFS Administration Guide.\n\n"); ret = 0; } else if (argc || all) { /* Upgrade filesystems */ @@ -1768,183 +2224,820 @@ zfs_do_upgrade(int argc, char **argv) } /* - * zfs userspace + * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] + * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot + * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] + * [-S field [-S field]...] 
[-t type[,...]] filesystem | snapshot + * + * -H Scripted mode; elide headers and separate columns by tabs. + * -i Translate SID to POSIX ID. + * -n Print numeric ID instead of user/group name. + * -o Control which fields to display. + * -p Use exact (parsable) numeric output. + * -s Specify sort columns, descending order. + * -S Specify sort columns, ascending order. + * -t Control which object types to display. + * + * Displays space consumed by, and quotas on, each user in the specified + * filesystem or snapshot. */ -static int -userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) -{ - zfs_userquota_prop_t *typep = arg; - zfs_userquota_prop_t p = *typep; - char *name = NULL; - char *ug, *propname; - char namebuf[32]; - char sizebuf[32]; - if (domain == NULL || domain[0] == '\0') { - if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) { - struct group *g = getgrgid(rid); - if (g) - name = g->gr_name; - } else { - struct passwd *p = getpwuid(rid); - if (p) - name = p->pw_name; - } - } +/* us_field_types, us_field_hdr and us_field_names should be kept in sync */ +enum us_field_types { + USFIELD_TYPE, + USFIELD_NAME, + USFIELD_USED, + USFIELD_QUOTA +}; +static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA" }; +static char *us_field_names[] = { "type", "name", "used", "quota" }; +#define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) + +#define USTYPE_PSX_GRP (1 << 0) +#define USTYPE_PSX_USR (1 << 1) +#define USTYPE_SMB_GRP (1 << 2) +#define USTYPE_SMB_USR (1 << 3) +#define USTYPE_ALL \ + (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR) + +static int us_type_bits[] = { + USTYPE_PSX_GRP, + USTYPE_PSX_USR, + USTYPE_SMB_GRP, + USTYPE_SMB_USR, + USTYPE_ALL +}; +static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", + "smbuser", "all" }; - if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) - ug = "group"; - else - ug = "user"; +typedef struct us_node { + nvlist_t *usn_nvl; + uu_avl_node_t usn_avlnode; + uu_list_node_t usn_listnode; +} us_node_t; + +typedef struct us_cbdata { + nvlist_t **cb_nvlp; + uu_avl_pool_t *cb_avl_pool; + uu_avl_t *cb_avl; + boolean_t cb_numname; + boolean_t cb_nicenum; + boolean_t cb_sid2posix; + zfs_userquota_prop_t cb_prop; + zfs_sort_column_t *cb_sortcol; + size_t cb_width[USFIELD_LAST]; +} us_cbdata_t; + +static boolean_t us_populated = B_FALSE; + +typedef struct { + zfs_sort_column_t *si_sortcol; + boolean_t si_numname; +} us_sort_info_t; - if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED) - propname = "used"; - else - propname = "quota"; +static int +us_field_index(char *field) +{ + int i; - if (name == NULL) { - (void) snprintf(namebuf, sizeof (namebuf), - "%llu", (longlong_t)rid); - name = namebuf; + for (i = 0; i < USFIELD_LAST; i++) { + if (strcmp(field, us_field_names[i]) == 0) + return (i); } - zfs_nicenum(space, sizebuf, sizeof (sizebuf)); - (void) printf("%s %s %s%c%s %s\n", propname, ug, domain, - domain[0] ? 
'-' : ' ', name, sizebuf); - - return (0); + return (-1); } static int -zfs_do_userspace(int argc, char **argv) +us_compare(const void *larg, const void *rarg, void *unused) { - zfs_handle_t *zhp; - zfs_userquota_prop_t p; - int error; + const us_node_t *l = larg; + const us_node_t *r = rarg; + us_sort_info_t *si = (us_sort_info_t *)unused; + zfs_sort_column_t *sortcol = si->si_sortcol; + boolean_t numname = si->si_numname; + nvlist_t *lnvl = l->usn_nvl; + nvlist_t *rnvl = r->usn_nvl; + int rc = 0; + boolean_t lvb, rvb; + + for (; sortcol != NULL; sortcol = sortcol->sc_next) { + char *lvstr = ""; + char *rvstr = ""; + uint32_t lv32 = 0; + uint32_t rv32 = 0; + uint64_t lv64 = 0; + uint64_t rv64 = 0; + zfs_prop_t prop = sortcol->sc_prop; + const char *propname = NULL; + boolean_t reverse = sortcol->sc_reverse; + + switch (prop) { + case ZFS_PROP_TYPE: + propname = "type"; + (void) nvlist_lookup_uint32(lnvl, propname, &lv32); + (void) nvlist_lookup_uint32(rnvl, propname, &rv32); + if (rv32 != lv32) + rc = (rv32 < lv32) ? 1 : -1; + break; + case ZFS_PROP_NAME: + propname = "name"; + if (numname) { + (void) nvlist_lookup_uint64(lnvl, propname, + &lv64); + (void) nvlist_lookup_uint64(rnvl, propname, + &rv64); + if (rv64 != lv64) + rc = (rv64 < lv64) ? 1 : -1; + } else { + (void) nvlist_lookup_string(lnvl, propname, + &lvstr); + (void) nvlist_lookup_string(rnvl, propname, + &rvstr); + rc = strcmp(lvstr, rvstr); + } + break; + case ZFS_PROP_USED: + case ZFS_PROP_QUOTA: + if (!us_populated) + break; + if (prop == ZFS_PROP_USED) + propname = "used"; + else + propname = "quota"; + (void) nvlist_lookup_uint64(lnvl, propname, &lv64); + (void) nvlist_lookup_uint64(rnvl, propname, &rv64); + if (rv64 != lv64) + rc = (rv64 < lv64) ? 1 : -1; + break; - /* - * Try the python version. If the execv fails, we'll continue - * and do a simplistic implementation. - */ - (void) execv(pypath, argv-1); + default: + break; + } - (void) printf("internal error: %s not found\n" - "falling back on built-in implementation, " - "some features will not work\n", pypath); + if (rc != 0) { + if (rc < 0) + return (reverse ? 1 : -1); + else + return (reverse ? -1 : 1); + } + } - if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL) - return (1); + /* + * If entries still seem to be the same, check if they are of the same + * type (smbentity is added only if we are doing SID to POSIX ID + * translation where we can have duplicate type/name combinations). + */ + if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && + nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && + lvb != rvb) + return (lvb < rvb ? -1 : 1); - (void) printf("PROP TYPE NAME VALUE\n"); + return (0); +} - for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { - error = zfs_userspace(zhp, p, userspace_cb, &p); - if (error) - break; +static inline const char * +us_type2str(unsigned field_type) +{ + switch (field_type) { + case USTYPE_PSX_USR: + return ("POSIX User"); + case USTYPE_PSX_GRP: + return ("POSIX Group"); + case USTYPE_SMB_USR: + return ("SMB User"); + case USTYPE_SMB_GRP: + return ("SMB Group"); + default: + return ("Undefined"); } - return (error); } -/* - * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] - * [-s property [-s property]...] [-S property [-S property]...] - * ... - * - * -r Recurse over all children - * -d Limit recursion by depth. - * -H Scripted mode; elide headers and separate columns by tabs - * -o Control which fields to display. - * -t Control which object types to display. 
- * -s Specify sort columns, descending order. - * -S Specify sort columns, ascending order. - * - * When given no arguments, lists all filesystems in the system. - * Otherwise, list the specified datasets, optionally recursing down them if - * '-r' is specified. - */ -typedef struct list_cbdata { - boolean_t cb_first; - boolean_t cb_scripted; - zprop_list_t *cb_proplist; -} list_cbdata_t; - -/* - * Given a list of columns to display, output appropriate headers for each one. - */ -static void -print_header(zprop_list_t *pl) +static int +userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) { - char headerbuf[ZFS_MAXPROPLEN]; - const char *header; - int i; - boolean_t first = B_TRUE; - boolean_t right_justify; + us_cbdata_t *cb = (us_cbdata_t *)arg; + zfs_userquota_prop_t prop = cb->cb_prop; + char *name = NULL; + char *propname; + char sizebuf[32]; + us_node_t *node; + uu_avl_pool_t *avl_pool = cb->cb_avl_pool; + uu_avl_t *avl = cb->cb_avl; + uu_avl_index_t idx; + nvlist_t *props; + us_node_t *n; + zfs_sort_column_t *sortcol = cb->cb_sortcol; + unsigned type = 0; + const char *typestr; + size_t namelen; + size_t typelen; + size_t sizelen; + int typeidx, nameidx, sizeidx; + us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; + boolean_t smbentity = B_FALSE; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + node = safe_malloc(sizeof (us_node_t)); + uu_avl_node_init(node, &node->usn_avlnode, avl_pool); + node->usn_nvl = props; + + if (domain != NULL && domain[0] != '\0') { + /* SMB */ + char sid[MAXNAMELEN + 32]; + uid_t id; +#ifdef illumos + int err; + int flag = IDMAP_REQ_FLG_USE_CACHE; +#endif - for (; pl != NULL; pl = pl->pl_next) { - if (!first) { - (void) printf(" "); + smbentity = B_TRUE; + + (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); + + if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { + type = USTYPE_SMB_GRP; +#ifdef illumos + err = sid_to_id(sid, B_FALSE, &id); +#endif } else { - first = B_FALSE; + type = USTYPE_SMB_USR; +#ifdef illumos + err = sid_to_id(sid, B_TRUE, &id); +#endif } - right_justify = B_FALSE; - if (pl->pl_prop != ZPROP_INVAL) { - header = zfs_prop_column_name(pl->pl_prop); - right_justify = zfs_prop_align_right(pl->pl_prop); +#ifdef illumos + if (err == 0) { + rid = id; + if (!cb->cb_sid2posix) { + if (type == USTYPE_SMB_USR) { + (void) idmap_getwinnamebyuid(rid, flag, + &name, NULL); + } else { + (void) idmap_getwinnamebygid(rid, flag, + &name, NULL); + } + if (name == NULL) + name = sid; + } + } +#endif + } + + if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { + /* POSIX or -i */ + if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { + type = USTYPE_PSX_GRP; + if (!cb->cb_numname) { + struct group *g; + + if ((g = getgrgid(rid)) != NULL) + name = g->gr_name; + } } else { - for (i = 0; pl->pl_user_prop[i] != '\0'; i++) - headerbuf[i] = toupper(pl->pl_user_prop[i]); - headerbuf[i] = '\0'; - header = headerbuf; + type = USTYPE_PSX_USR; + if (!cb->cb_numname) { + struct passwd *p; + + if ((p = getpwuid(rid)) != NULL) + name = p->pw_name; + } } + } - if (pl->pl_next == NULL && !right_justify) - (void) printf("%s", header); - else if (right_justify) - (void) printf("%*s", pl->pl_width, header); - else - (void) printf("%-*s", pl->pl_width, header); + /* + * Make sure that the type/name combination is unique when doing + * SID to POSIX ID translation (hence changing the type from SMB to + * POSIX). 
+ */ + if (cb->cb_sid2posix && + nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) + nomem(); + + /* Calculate/update width of TYPE field */ + typestr = us_type2str(type); + typelen = strlen(gettext(typestr)); + typeidx = us_field_index("type"); + if (typelen > cb->cb_width[typeidx]) + cb->cb_width[typeidx] = typelen; + if (nvlist_add_uint32(props, "type", type) != 0) + nomem(); + + /* Calculate/update width of NAME field */ + if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { + if (nvlist_add_uint64(props, "name", rid) != 0) + nomem(); + namelen = snprintf(NULL, 0, "%u", rid); + } else { + if (nvlist_add_string(props, "name", name) != 0) + nomem(); + namelen = strlen(name); + } + nameidx = us_field_index("name"); + if (namelen > cb->cb_width[nameidx]) + cb->cb_width[nameidx] = namelen; + + /* + * Check if this type/name combination is in the list and update it; + * otherwise add new node to the list. + */ + if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { + uu_avl_insert(avl, node, idx); + } else { + nvlist_free(props); + free(node); + node = n; + props = node->usn_nvl; } - (void) printf("\n"); + /* Calculate/update width of USED/QUOTA fields */ + if (cb->cb_nicenum) + zfs_nicenum(space, sizebuf, sizeof (sizebuf)); + else + (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", space); + sizelen = strlen(sizebuf); + if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED) { + propname = "used"; + if (!nvlist_exists(props, "quota")) + (void) nvlist_add_uint64(props, "quota", 0); + } else { + propname = "quota"; + if (!nvlist_exists(props, "used")) + (void) nvlist_add_uint64(props, "used", 0); + } + sizeidx = us_field_index(propname); + if (sizelen > cb->cb_width[sizeidx]) + cb->cb_width[sizeidx] = sizelen; + + if (nvlist_add_uint64(props, propname, space) != 0) + nomem(); + + return (0); } -/* - * Given a dataset and a list of fields, print out all the properties according - * to the described layout. 
- */ static void -print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) +print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, + size_t *width, us_node_t *node) { + nvlist_t *nvl = node->usn_nvl; + char valstr[MAXNAMELEN]; boolean_t first = B_TRUE; - char property[ZFS_MAXPROPLEN]; - nvlist_t *userprops = zfs_get_user_props(zhp); - nvlist_t *propval; - char *propstr; - boolean_t right_justify; - int width; + int cfield = 0; + int field; + uint32_t ustype; + + /* Check type */ + (void) nvlist_lookup_uint32(nvl, "type", &ustype); + if (!(ustype & types)) + return; + + while ((field = fields[cfield]) != USFIELD_LAST) { + nvpair_t *nvp = NULL; + data_type_t type; + uint32_t val32; + uint64_t val64; + char *strval = NULL; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + if (strcmp(nvpair_name(nvp), + us_field_names[field]) == 0) + break; + } + + type = nvpair_type(nvp); + switch (type) { + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &val32); + break; + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &val64); + break; + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &strval); + break; + default: + (void) fprintf(stderr, "invalid data type\n"); + } + + switch (field) { + case USFIELD_TYPE: + strval = (char *)us_type2str(val32); + break; + case USFIELD_NAME: + if (type == DATA_TYPE_UINT64) { + (void) sprintf(valstr, "%llu", val64); + strval = valstr; + } + break; + case USFIELD_USED: + case USFIELD_QUOTA: + if (type == DATA_TYPE_UINT64) { + if (parsable) { + (void) sprintf(valstr, "%llu", val64); + } else { + zfs_nicenum(val64, valstr, + sizeof (valstr)); + } + if (field == USFIELD_QUOTA && + strcmp(valstr, "0") == 0) + strval = "none"; + else + strval = valstr; + } + break; + } - for (; pl != NULL; pl = pl->pl_next) { if (!first) { if (scripted) (void) printf("\t"); else (void) printf(" "); - } else { - first = B_FALSE; } + if (scripted) + (void) printf("%s", strval); + else if (field == USFIELD_TYPE || field == USFIELD_NAME) + (void) printf("%-*s", width[field], strval); + else + (void) printf("%*s", width[field], strval); - if (pl->pl_prop != ZPROP_INVAL) { - if (zfs_prop_get(zhp, pl->pl_prop, property, - sizeof (property), NULL, NULL, 0, B_FALSE) != 0) + first = B_FALSE; + cfield++; + } + + (void) printf("\n"); +} + +static void +print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, + size_t *width, boolean_t rmnode, uu_avl_t *avl) +{ + us_node_t *node; + const char *col; + int cfield = 0; + int field; + + if (!scripted) { + boolean_t first = B_TRUE; + + while ((field = fields[cfield]) != USFIELD_LAST) { + col = gettext(us_field_hdr[field]); + if (field == USFIELD_TYPE || field == USFIELD_NAME) { + (void) printf(first ? "%-*s" : " %-*s", + width[field], col); + } else { + (void) printf(first ? 
"%*s" : " %*s", + width[field], col); + } + first = B_FALSE; + cfield++; + } + (void) printf("\n"); + } + + for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { + print_us_node(scripted, parsable, fields, types, width, node); + if (rmnode) + nvlist_free(node->usn_nvl); + } +} + +static int +zfs_do_userspace(int argc, char **argv) +{ + zfs_handle_t *zhp; + zfs_userquota_prop_t p; + + uu_avl_pool_t *avl_pool; + uu_avl_t *avl_tree; + uu_avl_walk_t *walk; + char *delim; + char deffields[] = "type,name,used,quota"; + char *ofield = NULL; + char *tfield = NULL; + int cfield = 0; + int fields[256]; + int i; + boolean_t scripted = B_FALSE; + boolean_t prtnum = B_FALSE; + boolean_t parsable = B_FALSE; + boolean_t sid2posix = B_FALSE; + int ret = 0; + int c; + zfs_sort_column_t *sortcol = NULL; + int types = USTYPE_PSX_USR | USTYPE_SMB_USR; + us_cbdata_t cb; + us_node_t *node; + us_node_t *rmnode; + uu_list_pool_t *listpool; + uu_list_t *list; + uu_avl_index_t idx = 0; + uu_list_index_t idx2 = 0; + + if (argc < 2) + usage(B_FALSE); + + if (strcmp(argv[0], "groupspace") == 0) + /* Toggle default group types */ + types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; + + while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { + switch (c) { + case 'n': + prtnum = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'p': + parsable = B_TRUE; + break; + case 'o': + ofield = optarg; + break; + case 's': + case 'S': + if (zfs_add_sort_column(&sortcol, optarg, + c == 's' ? B_FALSE : B_TRUE) != 0) { + (void) fprintf(stderr, + gettext("invalid field '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 't': + tfield = optarg; + break; + case 'i': + sid2posix = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing dataset name\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* Use default output fields if not specified using -o */ + if (ofield == NULL) + ofield = deffields; + do { + if ((delim = strchr(ofield, ',')) != NULL) + *delim = '\0'; + if ((fields[cfield++] = us_field_index(ofield)) == -1) { + (void) fprintf(stderr, gettext("invalid type '%s' " + "for -o option\n"), ofield); + return (-1); + } + if (delim != NULL) + ofield = delim + 1; + } while (delim != NULL); + fields[cfield] = USFIELD_LAST; + + /* Override output types (-t option) */ + if (tfield != NULL) { + types = 0; + + do { + boolean_t found = B_FALSE; + + if ((delim = strchr(tfield, ',')) != NULL) + *delim = '\0'; + for (i = 0; i < sizeof (us_type_bits) / sizeof (int); + i++) { + if (strcmp(tfield, us_type_names[i]) == 0) { + found = B_TRUE; + types |= us_type_bits[i]; + break; + } + } + if (!found) { + (void) fprintf(stderr, gettext("invalid type " + "'%s' for -t option\n"), tfield); + return (-1); + } + if (delim != NULL) + tfield = delim + 1; + } while (delim != NULL); + } + + if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) + return (1); + + if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), + offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) + nomem(); + if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); + + /* Always add default sorting columns */ + (void) 
zfs_add_sort_column(&sortcol, "type", B_FALSE); + (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); + + cb.cb_sortcol = sortcol; + cb.cb_numname = prtnum; + cb.cb_nicenum = !parsable; + cb.cb_avl_pool = avl_pool; + cb.cb_avl = avl_tree; + cb.cb_sid2posix = sid2posix; + + for (i = 0; i < USFIELD_LAST; i++) + cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); + + for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { + if (((p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA) && + !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || + ((p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) && + !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP)))) + continue; + cb.cb_prop = p; + if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) + return (ret); + } + + /* Sort the list */ + if ((node = uu_avl_first(avl_tree)) == NULL) + return (0); + + us_populated = B_TRUE; + + listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), + offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); + list = uu_list_create(listpool, NULL, UU_DEFAULT); + uu_list_node_init(node, &node->usn_listnode, listpool); + + while (node != NULL) { + rmnode = node; + node = uu_avl_next(avl_tree, node); + uu_avl_remove(avl_tree, rmnode); + if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) + uu_list_insert(list, rmnode, idx2); + } + + for (node = uu_list_first(list); node != NULL; + node = uu_list_next(list, node)) { + us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; + + if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) + uu_avl_insert(avl_tree, node, idx); + } + + uu_list_destroy(list); + uu_list_pool_destroy(listpool); + + /* Print and free node nvlist memory */ + print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, + cb.cb_avl); + + zfs_free_sort_columns(sortcol); + + /* Clean up the AVL tree */ + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(cb.cb_avl, node); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(avl_tree); + uu_avl_pool_destroy(avl_pool); + + return (ret); +} + +/* + * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] ... + * [-t type[,...]] [filesystem|volume|snapshot] ... + * + * -H Scripted mode; elide headers and separate columns by tabs. + * -p Display values in parsable (literal) format. + * -r Recurse over all children. + * -d Limit recursion by depth. + * -o Control which fields to display. + * -s Specify sort columns, descending order. + * -S Specify sort columns, ascending order. + * -t Control which object types to display. + * + * When given no arguments, list all filesystems in the system. + * Otherwise, list the specified datasets, optionally recursing down them if + * '-r' is specified. + */ +typedef struct list_cbdata { + boolean_t cb_first; + boolean_t cb_literal; + boolean_t cb_scripted; + zprop_list_t *cb_proplist; +} list_cbdata_t; + +/* + * Given a list of columns to display, output appropriate headers for each one. 
+ */ +static void +print_header(list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + char headerbuf[ZFS_MAXPROPLEN]; + const char *header; + int i; + boolean_t first = B_TRUE; + boolean_t right_justify; + + for (; pl != NULL; pl = pl->pl_next) { + if (!first) { + (void) printf(" "); + } else { + first = B_FALSE; + } + + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + header = zfs_prop_column_name(pl->pl_prop); + right_justify = zfs_prop_align_right(pl->pl_prop); + } else { + for (i = 0; pl->pl_user_prop[i] != '\0'; i++) + headerbuf[i] = toupper(pl->pl_user_prop[i]); + headerbuf[i] = '\0'; + header = headerbuf; + } + + if (pl->pl_next == NULL && !right_justify) + (void) printf("%s", header); + else if (right_justify) + (void) printf("%*s", pl->pl_width, header); + else + (void) printf("%-*s", pl->pl_width, header); + } + + (void) printf("\n"); +} + +/* + * Given a dataset and a list of fields, print out all the properties according + * to the described layout. + */ +static void +print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + boolean_t first = B_TRUE; + char property[ZFS_MAXPROPLEN]; + nvlist_t *userprops = zfs_get_user_props(zhp); + nvlist_t *propval; + char *propstr; + boolean_t right_justify; + + for (; pl != NULL; pl = pl->pl_next) { + if (!first) { + if (cb->cb_scripted) + (void) printf("\t"); + else + (void) printf(" "); + } else { + first = B_FALSE; + } + + if (pl->pl_prop == ZFS_PROP_NAME) { + (void) strlcpy(property, zfs_get_name(zhp), + sizeof (property)); + propstr = property; + right_justify = zfs_prop_align_right(pl->pl_prop); + } else if (pl->pl_prop != ZPROP_INVAL) { + if (zfs_prop_get(zhp, pl->pl_prop, property, + sizeof (property), NULL, NULL, 0, + cb->cb_literal) != 0) propstr = "-"; else propstr = property; - right_justify = zfs_prop_align_right(pl->pl_prop); } else if (zfs_prop_userquota(pl->pl_user_prop)) { if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, - property, sizeof (property), B_FALSE) != 0) + property, sizeof (property), cb->cb_literal) != 0) + propstr = "-"; + else + propstr = property; + right_justify = B_TRUE; + } else if (zfs_prop_written(pl->pl_user_prop)) { + if (zfs_prop_get_written(zhp, pl->pl_user_prop, + property, sizeof (property), cb->cb_literal) != 0) propstr = "-"; else propstr = property; @@ -1959,19 +3052,17 @@ print_dataset(zfs_handle_t *zhp, zprop_l right_justify = B_FALSE; } - width = pl->pl_width; - /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. 
*/ - if (scripted || (pl->pl_next == NULL && !right_justify)) + if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) - (void) printf("%*s", width, propstr); + (void) printf("%*s", pl->pl_width, propstr); else - (void) printf("%-*s", width, propstr); + (void) printf("%-*s", pl->pl_width, propstr); } (void) printf("\n"); @@ -1987,11 +3078,11 @@ list_callback(zfs_handle_t *zhp, void *d if (cbp->cb_first) { if (!cbp->cb_scripted) - print_header(cbp->cb_proplist); + print_header(cbp); cbp->cb_first = B_FALSE; } - print_dataset(zhp, cbp->cb_proplist, cbp->cb_scripted); + print_dataset(zhp, cbp); return (0); } @@ -2000,7 +3091,6 @@ static int zfs_do_list(int argc, char **argv) { int c; - boolean_t scripted = B_FALSE; static char default_fields[] = "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; @@ -2009,16 +3099,20 @@ zfs_do_list(int argc, char **argv) list_cbdata_t cb = { 0 }; char *value; int limit = 0; - int ret; + int ret = 0; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ - while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) { + while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { switch (c) { case 'o': fields = optarg; break; + case 'p': + cb.cb_literal = B_TRUE; + flags |= ZFS_ITER_LITERAL_PROPS; + break; case 'd': limit = parse_depth(optarg, &flags); break; @@ -2026,7 +3120,7 @@ zfs_do_list(int argc, char **argv) flags |= ZFS_ITER_RECURSE; break; case 'H': - scripted = B_TRUE; + cb.cb_scripted = B_TRUE; break; case 's': if (zfs_add_sort_column(&sortcol, optarg, @@ -2050,7 +3144,8 @@ zfs_do_list(int argc, char **argv) flags &= ~ZFS_ITER_PROP_LISTSNAPS; while (*optarg != '\0') { static char *type_subopts[] = { "filesystem", - "volume", "snapshot", "all", NULL }; + "volume", "snapshot", "snap", "bookmark", + "all", NULL }; switch (getsubopt(&optarg, type_subopts, &value)) { @@ -2061,16 +3156,20 @@ zfs_do_list(int argc, char **argv) types |= ZFS_TYPE_VOLUME; break; case 2: + case 3: types |= ZFS_TYPE_SNAPSHOT; break; - case 3: - types = ZFS_TYPE_DATASET; + case 4: + types |= ZFS_TYPE_BOOKMARK; + break; + case 5: + types = ZFS_TYPE_DATASET | + ZFS_TYPE_BOOKMARK; break; - default: (void) fprintf(stderr, gettext("invalid type '%s'\n"), - value); + suboptarg); usage(B_FALSE); } } @@ -2094,6 +3193,13 @@ zfs_do_list(int argc, char **argv) fields = default_fields; /* + * If we are only going to list snapshot names and sort by name, + * then we can use faster version. + */ + if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol)) + flags |= ZFS_ITER_SIMPLE; + + /* * If "-o space" and no types were specified, don't display snapshots. */ if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) @@ -2108,7 +3214,6 @@ zfs_do_list(int argc, char **argv) != 0) usage(B_FALSE); - cb.cb_scripted = scripted; cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, @@ -2124,9 +3229,10 @@ zfs_do_list(int argc, char **argv) } /* - * zfs rename - * zfs rename -p + * zfs rename [-f] + * zfs rename [-f] -p * zfs rename -r + * zfs rename -u [-p] * * Renames the given dataset to another of the same type. 
* @@ -2137,19 +3243,27 @@ static int zfs_do_rename(int argc, char **argv) { zfs_handle_t *zhp; + renameflags_t flags = { 0 }; int c; - int ret; - boolean_t recurse = B_FALSE; + int ret = 0; + int types; boolean_t parents = B_FALSE; + char *snapshot = NULL; /* check options */ - while ((c = getopt(argc, argv, "pr")) != -1) { + while ((c = getopt(argc, argv, "fpru")) != -1) { switch (c) { case 'p': parents = B_TRUE; break; case 'r': - recurse = B_TRUE; + flags.recurse = B_TRUE; + break; + case 'u': + flags.nounmount = B_TRUE; + break; + case 'f': + flags.forceunmount = B_TRUE; break; case '?': default: @@ -2178,20 +3292,45 @@ zfs_do_rename(int argc, char **argv) usage(B_FALSE); } - if (recurse && parents) { + if (flags.recurse && parents) { (void) fprintf(stderr, gettext("-p and -r options are mutually " "exclusive\n")); usage(B_FALSE); } - if (recurse && strchr(argv[0], '@') == 0) { + if (flags.recurse && strchr(argv[0], '@') == 0) { (void) fprintf(stderr, gettext("source dataset for recursive " "rename must be a snapshot\n")); usage(B_FALSE); } - if ((zhp = zfs_open(g_zfs, argv[0], parents ? ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME : ZFS_TYPE_DATASET)) == NULL) + if (flags.nounmount && parents) { + (void) fprintf(stderr, gettext("-u and -p options are mutually " + "exclusive\n")); + usage(B_FALSE); + } + + if (flags.nounmount) + types = ZFS_TYPE_FILESYSTEM; + else if (parents) + types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + else + types = ZFS_TYPE_DATASET; + + if (flags.recurse) { + /* + * When we do recursive rename we are fine when the given + * snapshot for the given dataset doesn't exist - it can + * still exists below. + */ + + snapshot = strchr(argv[0], '@'); + assert(snapshot != NULL); + *snapshot = '\0'; + snapshot++; + } + + if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) return (1); /* If we were asked and the name looks good, try to create ancestors. */ @@ -2201,7 +3340,7 @@ zfs_do_rename(int argc, char **argv) return (1); } - ret = (zfs_rename(zhp, argv[1], recurse) != 0); + ret = (zfs_rename(zhp, snapshot, argv[1], flags) != 0); zfs_close(zhp); return (ret); @@ -2217,7 +3356,7 @@ static int zfs_do_promote(int argc, char **argv) { zfs_handle_t *zhp; - int ret; + int ret = 0; /* check options */ if (argc > 1 && argv[1][0] == '-') { @@ -2266,9 +3405,30 @@ typedef struct rollback_cbdata { char *cb_target; int cb_error; boolean_t cb_recurse; - boolean_t cb_dependent; } rollback_cbdata_t; +static int +rollback_check_dependent(zfs_handle_t *zhp, void *data) +{ + rollback_cbdata_t *cbp = data; + + if (cbp->cb_first && cbp->cb_recurse) { + (void) fprintf(stderr, gettext("cannot rollback to " + "'%s': clones of previous snapshots exist\n"), + cbp->cb_target); + (void) fprintf(stderr, gettext("use '-R' to " + "force deletion of the following clones and " + "dependents:\n")); + cbp->cb_first = 0; + cbp->cb_error = 1; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + + zfs_close(zhp); + return (0); +} + /* * Report any snapshots more recent than the one specified. Used when '-r' is * not specified. 
We reuse this same callback for the snapshot dependents - if @@ -2285,52 +3445,30 @@ rollback_check(zfs_handle_t *zhp, void * return (0); } - if (!cbp->cb_dependent) { - if (strcmp(zfs_get_name(zhp), cbp->cb_target) != 0 && - zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && - zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > - cbp->cb_create) { - - if (cbp->cb_first && !cbp->cb_recurse) { - (void) fprintf(stderr, gettext("cannot " - "rollback to '%s': more recent snapshots " - "exist\n"), - cbp->cb_target); - (void) fprintf(stderr, gettext("use '-r' to " - "force deletion of the following " - "snapshots:\n")); - cbp->cb_first = 0; - cbp->cb_error = 1; - } - - if (cbp->cb_recurse) { - cbp->cb_dependent = B_TRUE; - if (zfs_iter_dependents(zhp, B_TRUE, - rollback_check, cbp) != 0) { - zfs_close(zhp); - return (-1); - } - cbp->cb_dependent = B_FALSE; - } else { - (void) fprintf(stderr, "%s\n", - zfs_get_name(zhp)); - } - } - } else { - if (cbp->cb_first && cbp->cb_recurse) { - (void) fprintf(stderr, gettext("cannot rollback to " - "'%s': clones of previous snapshots exist\n"), + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { + if (cbp->cb_first && !cbp->cb_recurse) { + (void) fprintf(stderr, gettext("cannot " + "rollback to '%s': more recent snapshots " + "or bookmarks exist\n"), cbp->cb_target); - (void) fprintf(stderr, gettext("use '-R' to " - "force deletion of the following clones and " - "dependents:\n")); + (void) fprintf(stderr, gettext("use '-r' to " + "force deletion of the following " + "snapshots and bookmarks:\n")); cbp->cb_first = 0; cbp->cb_error = 1; } - (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + if (cbp->cb_recurse) { + if (zfs_iter_dependents(zhp, B_TRUE, + rollback_check_dependent, cbp) != 0) { + zfs_close(zhp); + return (-1); + } + } else { + (void) fprintf(stderr, "%s\n", + zfs_get_name(zhp)); + } } - zfs_close(zhp); return (0); } @@ -2338,12 +3476,12 @@ rollback_check(zfs_handle_t *zhp, void * static int zfs_do_rollback(int argc, char **argv) { - int ret; + int ret = 0; int c; boolean_t force = B_FALSE; rollback_cbdata_t cb = { 0 }; zfs_handle_t *zhp, *snap; - char parentname[ZFS_MAXNAMELEN]; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *delim; /* check options */ @@ -2400,7 +3538,9 @@ zfs_do_rollback(int argc, char **argv) cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); cb.cb_first = B_TRUE; cb.cb_error = 0; - if ((ret = zfs_iter_children(zhp, rollback_check, &cb)) != 0) + if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb)) != 0) + goto out; + if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) @@ -2422,21 +3562,17 @@ out: } /* - * zfs set property=value { fs | snap | vol } ... + * zfs set property=value ... { fs | snap | vol } ... * - * Sets the given property for all datasets specified on the command line. + * Sets the given properties for all datasets specified on the command line. 
*/ -typedef struct set_cbdata { - char *cb_propname; - char *cb_value; -} set_cbdata_t; static int set_callback(zfs_handle_t *zhp, void *data) { - set_cbdata_t *cbp = data; + nvlist_t *props = data; - if (zfs_prop_set(zhp, cbp->cb_propname, cbp->cb_value) != 0) { + if (zfs_prop_set_list(zhp, props) != 0) { switch (libzfs_errno(g_zfs)) { case EZFS_MOUNTFAILED: (void) fprintf(stderr, gettext("property may be set " @@ -2455,8 +3591,9 @@ set_callback(zfs_handle_t *zhp, void *da static int zfs_do_set(int argc, char **argv) { - set_cbdata_t cb; - int ret; + nvlist_t *props = NULL; + int ds_start = -1; /* argv idx of first dataset arg */ + int ret = 0; /* check for options */ if (argc > 1 && argv[1][0] == '-') { @@ -2467,39 +3604,86 @@ zfs_do_set(int argc, char **argv) /* check number of arguments */ if (argc < 2) { - (void) fprintf(stderr, gettext("missing property=value " - "argument\n")); + (void) fprintf(stderr, gettext("missing arguments\n")); usage(B_FALSE); } if (argc < 3) { - (void) fprintf(stderr, gettext("missing dataset name\n")); + if (strchr(argv[1], '=') == NULL) { + (void) fprintf(stderr, gettext("missing property=value " + "argument(s)\n")); + } else { + (void) fprintf(stderr, gettext("missing dataset " + "name(s)\n")); + } usage(B_FALSE); } - /* validate property=value argument */ - cb.cb_propname = argv[1]; - if (((cb.cb_value = strchr(cb.cb_propname, '=')) == NULL) || - (cb.cb_value[1] == '\0')) { - (void) fprintf(stderr, gettext("missing value in " - "property=value argument\n")); + /* validate argument order: prop=val args followed by dataset args */ + for (int i = 1; i < argc; i++) { + if (strchr(argv[i], '=') != NULL) { + if (ds_start > 0) { + /* out-of-order prop=val argument */ + (void) fprintf(stderr, gettext("invalid " + "argument order\n"), i); + usage(B_FALSE); + } + } else if (ds_start < 0) { + ds_start = i; + } + } + if (ds_start < 0) { + (void) fprintf(stderr, gettext("missing dataset name(s)\n")); usage(B_FALSE); } - *cb.cb_value = '\0'; - cb.cb_value++; - - if (*cb.cb_propname == '\0') { - (void) fprintf(stderr, - gettext("missing property in property=value argument\n")); - usage(B_FALSE); + /* Populate a list of property settings */ + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + for (int i = 1; i < ds_start; i++) { + if ((ret = parseprop(props, argv[i])) != 0) + goto error; } - ret = zfs_for_each(argc - 2, argv + 2, 0, - ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb); + ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, + ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); +error: + nvlist_free(props); return (ret); } +typedef struct snap_cbdata { + nvlist_t *sd_nvl; + boolean_t sd_recursive; + const char *sd_snapname; +} snap_cbdata_t; + +static int +zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) +{ + snap_cbdata_t *sd = arg; + char *name; + int rv = 0; + int error; + + if (sd->sd_recursive && + zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { + zfs_close(zhp); + return (0); + } + + error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); + if (error == -1) + nomem(); + fnvlist_add_boolean(sd->sd_nvl, name); + free(name); + + if (sd->sd_recursive) + rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); + zfs_close(zhp); + return (rv); +} + /* * zfs snapshot [-r] [-o prop=value] ... 
* @@ -2509,26 +3693,27 @@ zfs_do_set(int argc, char **argv) static int zfs_do_snapshot(int argc, char **argv) { - boolean_t recursive = B_FALSE; - int ret; - char c; + int ret = 0; + int c; nvlist_t *props; + snap_cbdata_t sd = { 0 }; + boolean_t multiple_snaps = B_FALSE; - if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { - (void) fprintf(stderr, gettext("internal error: " - "out of memory\n")); - return (1); - } + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); /* check options */ while ((c = getopt(argc, argv, "ro:")) != -1) { switch (c) { case 'o': - if (parseprop(props)) + if (parseprop(props, optarg) != 0) return (1); break; case 'r': - recursive = B_TRUE; + sd.sd_recursive = B_TRUE; + multiple_snaps = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -2545,27 +3730,41 @@ zfs_do_snapshot(int argc, char **argv) (void) fprintf(stderr, gettext("missing snapshot argument\n")); goto usage; } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - goto usage; + + if (argc > 1) + multiple_snaps = B_TRUE; + for (; argc > 0; argc--, argv++) { + char *atp; + zfs_handle_t *zhp; + + atp = strchr(argv[0], '@'); + if (atp == NULL) + goto usage; + *atp = '\0'; + sd.sd_snapname = atp + 1; + zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + goto usage; + if (zfs_snapshot_cb(zhp, &sd) != 0) + goto usage; } - ret = zfs_snapshot(g_zfs, argv[0], recursive, props); + ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); + nvlist_free(sd.sd_nvl); nvlist_free(props); - if (ret && recursive) + if (ret != 0 && multiple_snaps) (void) fprintf(stderr, gettext("no snapshots were created\n")); return (ret != 0); usage: + nvlist_free(sd.sd_nvl); nvlist_free(props); usage(B_FALSE); return (-1); } /* - * zfs send [-vDp] -R [-i|-I <@snap>] - * zfs send [-vDp] [-i|-I <@snap>] - * * Send a backup stream to stdout. 
*/ static int @@ -2573,13 +3772,16 @@ zfs_do_send(int argc, char **argv) { char *fromname = NULL; char *toname = NULL; + char *resume_token = NULL; char *cp; zfs_handle_t *zhp; sendflags_t flags = { 0 }; int c, err; + nvlist_t *dbgnv = NULL; + boolean_t extraverbose = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) { switch (c) { case 'i': if (fromname) @@ -2598,12 +3800,31 @@ zfs_do_send(int argc, char **argv) case 'p': flags.props = B_TRUE; break; + case 'P': + flags.parsable = B_TRUE; + flags.verbose = B_TRUE; + break; case 'v': + if (flags.verbose) + extraverbose = B_TRUE; flags.verbose = B_TRUE; + flags.progress = B_TRUE; break; case 'D': flags.dedup = B_TRUE; break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'L': + flags.largeblock = B_TRUE; + break; + case 'e': + flags.embed_data = B_TRUE; + break; + case 't': + resume_token = optarg; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -2619,29 +3840,87 @@ zfs_do_send(int argc, char **argv) argc -= optind; argv += optind; - /* check number of arguments */ - if (argc < 1) { - (void) fprintf(stderr, gettext("missing snapshot argument\n")); - usage(B_FALSE); - } - if (argc > 1) { - (void) fprintf(stderr, gettext("too many arguments\n")); - usage(B_FALSE); + if (resume_token != NULL) { + if (fromname != NULL || flags.replicate || flags.props || + flags.dedup) { + (void) fprintf(stderr, + gettext("invalid flags combined with -t\n")); + usage(B_FALSE); + } + if (argc != 0) { + (void) fprintf(stderr, gettext("no additional " + "arguments are permitted with -t\n")); + usage(B_FALSE); + } + } else { + if (argc < 1) { + (void) fprintf(stderr, + gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } } - if (isatty(STDOUT_FILENO)) { + if (!flags.dryrun && isatty(STDOUT_FILENO)) { (void) fprintf(stderr, gettext("Error: Stream can not be written to a terminal.\n" "You must redirect standard output.\n")); return (1); } - cp = strchr(argv[0], '@'); - if (cp == NULL) { - (void) fprintf(stderr, - gettext("argument must be a snapshot\n")); - usage(B_FALSE); + if (resume_token != NULL) { + return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, + resume_token)); + } + + /* + * Special case sending a filesystem, or from a bookmark. + */ + if (strchr(argv[0], '@') == NULL || + (fromname && strchr(fromname, '#') != NULL)) { + char frombuf[ZFS_MAX_DATASET_NAME_LEN]; + enum lzc_send_flags lzc_flags = 0; + + if (flags.replicate || flags.doall || flags.props || + flags.dedup || flags.dryrun || flags.verbose || + flags.progress) { + (void) fprintf(stderr, + gettext("Error: " + "Unsupported flag with filesystem or bookmark.\n")); + return (1); + } + + zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); + if (zhp == NULL) + return (1); + + if (flags.largeblock) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (flags.embed_data) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + + if (fromname != NULL && + (fromname[0] == '#' || fromname[0] == '@')) { + /* + * Incremental source name begins with # or @. + * Default to same fs as target. 
+ */ + (void) strncpy(frombuf, argv[0], sizeof (frombuf)); + cp = strchr(frombuf, '@'); + if (cp != NULL) + *cp = '\0'; + (void) strlcat(frombuf, fromname, sizeof (frombuf)); + fromname = frombuf; + } + err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags); + zfs_close(zhp); + return (err != 0); } + + cp = strchr(argv[0], '@'); *cp = '\0'; toname = cp + 1; zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); @@ -2654,7 +3933,7 @@ zfs_do_send(int argc, char **argv) * case if they specify the origin. */ if (fromname && (cp = strchr(fromname, '@')) != NULL) { - char origin[ZFS_MAXNAMELEN]; + char origin[ZFS_MAX_DATASET_NAME_LEN]; zprop_source_t src; (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, @@ -2683,26 +3962,47 @@ zfs_do_send(int argc, char **argv) if (flags.replicate && fromname == NULL) flags.doall = B_TRUE; - err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0); + err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, + extraverbose ? &dbgnv : NULL); + + if (extraverbose && dbgnv != NULL) { + /* + * dump_nvlist prints to stdout, but that's been + * redirected to a file. Make it print to stderr + * instead. + */ + (void) dup2(STDERR_FILENO, STDOUT_FILENO); + dump_nvlist(dbgnv, 0); + nvlist_free(dbgnv); + } zfs_close(zhp); return (err != 0); } /* - * zfs receive [-denvF] - * * Restore a backup stream from stdin. */ static int zfs_do_receive(int argc, char **argv) { - int c, err; + int c, err = 0; recvflags_t flags = { 0 }; + boolean_t abort_resumable = B_FALSE; + + nvlist_t *props; + nvpair_t *nvp = NULL; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); /* check options */ - while ((c = getopt(argc, argv, ":denuvF")) != -1) { + while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) { switch (c) { + case 'o': + if (parseprop(props, optarg) != 0) + return (1); + break; case 'd': flags.isprefix = B_TRUE; break; @@ -2719,9 +4019,15 @@ zfs_do_receive(int argc, char **argv) case 'v': flags.verbose = B_TRUE; break; + case 's': + flags.resumable = B_TRUE; + break; case 'F': flags.force = B_TRUE; break; + case 'A': + abort_resumable = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -2747,6 +4053,51 @@ zfs_do_receive(int argc, char **argv) usage(B_FALSE); } + while ((nvp = nvlist_next_nvpair(props, nvp))) { + if (strcmp(nvpair_name(nvp), "origin") != 0) { + (void) fprintf(stderr, gettext("invalid option")); + usage(B_FALSE); + } + } + + if (abort_resumable) { + if (flags.isprefix || flags.istail || flags.dryrun || + flags.resumable || flags.nomount) { + (void) fprintf(stderr, gettext("invalid option")); + usage(B_FALSE); + } + + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(namebuf, sizeof (namebuf), + "%s/%%recv", argv[0]); + + if (zfs_dataset_exists(g_zfs, namebuf, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { + zfs_handle_t *zhp = zfs_open(g_zfs, + namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (1); + err = zfs_destroy(zhp, B_FALSE); + } else { + zfs_handle_t *zhp = zfs_open(g_zfs, + argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + usage(B_FALSE); + if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { + (void) fprintf(stderr, + gettext("'%s' does not have any " + "resumable receive state to abort\n"), + argv[0]); + return (1); + } + err = zfs_destroy(zhp, B_FALSE); + } + + return (err != 0); + } + if 
(isatty(STDIN_FILENO)) { (void) fprintf(stderr, gettext("Error: Backup stream can not be read " @@ -2754,147 +4105,1685 @@ zfs_do_receive(int argc, char **argv) "You must redirect standard input.\n")); return (1); } - - err = zfs_receive(g_zfs, argv[0], flags, STDIN_FILENO, NULL); + err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); return (err != 0); } -static int -zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) -{ - int errors = 0; - int i; - const char *tag; - boolean_t recursive = B_FALSE; - boolean_t temphold = B_FALSE; - const char *opts = holding ? "rt" : "r"; - int c; +/* + * allow/unallow stuff + */ +/* copied from zfs/sys/dsl_deleg.h */ +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" +#define ZFS_DELEG_PERM_DIFF "diff" +#define ZFS_DELEG_PERM_BOOKMARK "bookmark" + +#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE + +static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { + { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, + { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, + { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, + { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, + { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, + { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, + { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, + { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, + { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, + { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, + { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, + { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, + { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, + { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, + { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, + { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, + + { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, + { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, + { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, + { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, + { NULL, ZFS_DELEG_NOTE_NONE } +}; - /* check options */ - while ((c = getopt(argc, argv, opts)) != -1) { - switch (c) { - case 'r': - recursive = B_TRUE; - break; - case 't': - temphold = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } +/* permission structure */ +typedef struct deleg_perm { + zfs_deleg_who_type_t dp_who_type; + const char *dp_name; + boolean_t dp_local; + boolean_t dp_descend; +} deleg_perm_t; + +/* */ +typedef struct deleg_perm_node { + deleg_perm_t dpn_perm; + + uu_avl_node_t dpn_avl_node; +} deleg_perm_node_t; + +typedef struct fs_perm fs_perm_t; + +/* permissions set */ +typedef struct who_perm { + 
zfs_deleg_who_type_t who_type; + const char *who_name; /* id */ + char who_ug_name[256]; /* user/group name */ + fs_perm_t *who_fsperm; /* uplink */ + + uu_avl_t *who_deleg_perm_avl; /* permissions */ +} who_perm_t; + +/* */ +typedef struct who_perm_node { + who_perm_t who_perm; + uu_avl_node_t who_avl_node; +} who_perm_node_t; + +typedef struct fs_perm_set fs_perm_set_t; +/* fs permissions */ +struct fs_perm { + const char *fsp_name; - argc -= optind; - argv += optind; + uu_avl_t *fsp_sc_avl; /* sets,create */ + uu_avl_t *fsp_uge_avl; /* user,group,everyone */ - /* check number of arguments */ - if (argc < 2) - usage(B_FALSE); + fs_perm_set_t *fsp_set; /* uplink */ +}; - tag = argv[0]; - --argc; - ++argv; +/* */ +typedef struct fs_perm_node { + fs_perm_t fspn_fsperm; + uu_avl_t *fspn_avl; + + uu_list_node_t fspn_list_node; +} fs_perm_node_t; + +/* top level structure */ +struct fs_perm_set { + uu_list_pool_t *fsps_list_pool; + uu_list_t *fsps_list; /* list of fs_perms */ + + uu_avl_pool_t *fsps_named_set_avl_pool; + uu_avl_pool_t *fsps_who_perm_avl_pool; + uu_avl_pool_t *fsps_deleg_perm_avl_pool; +}; - if (holding && tag[0] == '.') { - /* tags starting with '.' are reserved for libzfs */ - (void) fprintf(stderr, gettext("tag may not start with '.'\n")); - usage(B_FALSE); +static inline const char * +deleg_perm_type(zfs_deleg_note_t note) +{ + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + case ZFS_DELEG_NOTE_GROUPUSED: + case ZFS_DELEG_NOTE_USERPROP: + case ZFS_DELEG_NOTE_USERQUOTA: + case ZFS_DELEG_NOTE_USERUSED: + /* other */ + return (gettext("other")); + default: + return (gettext("subcommand")); } +} - for (i = 0; i < argc; ++i) { - zfs_handle_t *zhp; - char parent[ZFS_MAXNAMELEN]; - const char *delim; - char *path = argv[i]; - - delim = strchr(path, '@'); - if (delim == NULL) { - (void) fprintf(stderr, - gettext("'%s' is not a snapshot\n"), path); - ++errors; - continue; - } - (void) strncpy(parent, path, delim - path); - parent[delim - path] = '\0'; - - zhp = zfs_open(g_zfs, parent, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); - if (zhp == NULL) { - ++errors; - continue; - } - if (holding) { - if (zfs_hold(zhp, delim+1, tag, recursive, - temphold, B_FALSE) != 0) - ++errors; - } else { - if (zfs_release(zhp, delim+1, tag, recursive) != 0) - ++errors; - } - zfs_close(zhp); +static int +who_type2weight(zfs_deleg_who_type_t who_type) +{ + int res; + switch (who_type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + res = 0; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + res = 1; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + res = 2; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + res = 3; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + res = 4; + break; + default: + res = -1; } - return (errors != 0); + return (res); } -/* - * zfs hold [-r] [-t] ... - * - * -r Recursively hold - * -t Temporary hold (hidden option) - * - * Apply a user-hold with the given tag to the list of snapshots. 
- */ +/* ARGSUSED */ static int -zfs_do_hold(int argc, char **argv) +who_perm_compare(const void *larg, const void *rarg, void *unused) { - return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); + const who_perm_node_t *l = larg; + const who_perm_node_t *r = rarg; + zfs_deleg_who_type_t ltype = l->who_perm.who_type; + zfs_deleg_who_type_t rtype = r->who_perm.who_type; + int lweight = who_type2weight(ltype); + int rweight = who_type2weight(rtype); + int res = lweight - rweight; + if (res == 0) + res = strncmp(l->who_perm.who_name, r->who_perm.who_name, + ZFS_MAX_DELEG_NAME-1); + + if (res == 0) + return (0); + if (res > 0) + return (1); + else + return (-1); } -/* - * zfs release [-r] ... - * - * -r Recursively release - * - * Release a user-hold with the given tag from the list of snapshots. - */ +/* ARGSUSED */ static int -zfs_do_release(int argc, char **argv) +deleg_perm_compare(const void *larg, const void *rarg, void *unused) { - return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); -} + const deleg_perm_node_t *l = larg; + const deleg_perm_node_t *r = rarg; + int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, + ZFS_MAX_DELEG_NAME-1); -typedef struct get_all_cbdata { - zfs_handle_t **cb_handles; - size_t cb_alloc; - size_t cb_used; - uint_t cb_types; - boolean_t cb_verbose; -} get_all_cbdata_t; + if (res == 0) + return (0); -#define CHECK_SPINNER 30 -#define SPINNER_TIME 3 /* seconds */ -#define MOUNT_TIME 5 /* seconds */ + if (res > 0) + return (1); + else + return (-1); +} -static int -get_one_dataset(zfs_handle_t *zhp, void *data) +static inline void +fs_perm_set_init(fs_perm_set_t *fspset) { - static char spin[] = { '-', '\\', '|', '/' }; - static int spinval = 0; - static int spincheck = 0; + bzero(fspset, sizeof (fs_perm_set_t)); + + if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", + sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), + NULL, UU_DEFAULT)) == NULL) + nomem(); + if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( + "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( + "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( + "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( + deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) + == NULL) + nomem(); +} + +static inline void fs_perm_fini(fs_perm_t *); +static inline void who_perm_fini(who_perm_t *); + +static inline void +fs_perm_set_fini(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = uu_list_first(fspset->fsps_list); + + while (node != NULL) { + fs_perm_node_t *next_node = + uu_list_next(fspset->fsps_list, node); + fs_perm_t *fsperm = &node->fspn_fsperm; + fs_perm_fini(fsperm); + uu_list_remove(fspset->fsps_list, node); + free(node); + node = next_node; + } + + uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); + uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); + uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); +} + +static inline void +deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, + const char *name) +{ + deleg_perm->dp_who_type = type; + deleg_perm->dp_name = name; +} + +static inline void 
+who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, + zfs_deleg_who_type_t type, const char *name) +{ + uu_avl_pool_t *pool; + pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; + + bzero(who_perm, sizeof (who_perm_t)); + + if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + who_perm->who_type = type; + who_perm->who_name = name; + who_perm->who_fsperm = fsperm; +} + +static inline void +who_perm_fini(who_perm_t *who_perm) +{ + deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); + + while (node != NULL) { + deleg_perm_node_t *next_node = + uu_avl_next(who_perm->who_deleg_perm_avl, node); + + uu_avl_remove(who_perm->who_deleg_perm_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(who_perm->who_deleg_perm_avl); +} + +static inline void +fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) +{ + uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; + uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; + + bzero(fsperm, sizeof (fs_perm_t)); + + if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + fsperm->fsp_set = fspset; + fsperm->fsp_name = fsname; +} + +static inline void +fs_perm_fini(fs_perm_t *fsperm) +{ + who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + uu_avl_remove(fsperm->fsp_sc_avl, node); + free(node); + node = next_node; + } + + node = uu_avl_first(fsperm->fsp_uge_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + uu_avl_remove(fsperm->fsp_uge_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(fsperm->fsp_sc_avl); + uu_avl_destroy(fsperm->fsp_uge_avl); +} + +static void +set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, + zfs_deleg_who_type_t who_type, const char *name, char locality) +{ + uu_avl_index_t idx = 0; + + deleg_perm_node_t *found_node = NULL; + deleg_perm_t *deleg_perm = &node->dpn_perm; + + deleg_perm_init(deleg_perm, who_type, name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) + uu_avl_insert(avl, node, idx); + else { + node = found_node; + deleg_perm = &node->dpn_perm; + } + + + switch (locality) { + case ZFS_DELEG_LOCAL: + deleg_perm->dp_local = B_TRUE; + break; + case ZFS_DELEG_DESCENDENT: + deleg_perm->dp_descend = B_TRUE; + break; + case ZFS_DELEG_NA: + break; + default: + assert(B_FALSE); /* invalid locality */ + } +} + +static inline int +parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; + uu_avl_t *avl = who_perm->who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_perm->who_type; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *name = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; + deleg_perm_node_t *node = + safe_malloc(sizeof (deleg_perm_node_t)); + + assert(type == DATA_TYPE_BOOLEAN); + + uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); + set_deleg_perm_node(avl, node, who_type, name, locality); + } + + return (0); +} + +static inline int 
+parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = fsperm->fsp_set; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *name = nvpair_name(nvp); + uu_avl_t *avl = NULL; + uu_avl_pool_t *avl_pool = NULL; + zfs_deleg_who_type_t perm_type = name[0]; + char perm_locality = name[1]; + const char *perm_name = name + 3; + boolean_t is_set = B_TRUE; + who_perm_t *who_perm = NULL; + + assert('$' == name[2]); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + switch (perm_type) { + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_NAMED_SET: + case ZFS_DELEG_NAMED_SET_SETS: + avl_pool = fspset->fsps_named_set_avl_pool; + avl = fsperm->fsp_sc_avl; + break; + case ZFS_DELEG_USER: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + avl_pool = fspset->fsps_who_perm_avl_pool; + avl = fsperm->fsp_uge_avl; + break; + + default: + assert(!"unhandled zfs_deleg_who_type_t"); + } + + if (is_set) { + who_perm_node_t *found_node = NULL; + who_perm_node_t *node = safe_malloc( + sizeof (who_perm_node_t)); + who_perm = &node->who_perm; + uu_avl_index_t idx = 0; + + uu_avl_node_init(node, &node->who_avl_node, avl_pool); + who_perm_init(who_perm, fsperm, perm_type, perm_name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) { + if (avl == fsperm->fsp_uge_avl) { + uid_t rid = 0; + struct passwd *p = NULL; + struct group *g = NULL; + const char *nice_name = NULL; + + switch (perm_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + rid = atoi(perm_name); + p = getpwuid(rid); + if (p) + nice_name = p->pw_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + rid = atoi(perm_name); + g = getgrgid(rid); + if (g) + nice_name = g->gr_name; + break; + + default: + break; + } + + if (nice_name != NULL) + (void) strlcpy( + node->who_perm.who_ug_name, + nice_name, 256); + } + + uu_avl_insert(avl, node, idx); + } else { + node = found_node; + who_perm = &node->who_perm; + } + } + + (void) parse_who_perm(who_perm, nvl2, perm_locality); + } + + return (0); +} + +static inline int +parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + uu_avl_index_t idx = 0; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *fsname = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + fs_perm_t *fsperm = NULL; + fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); + if (node == NULL) + nomem(); + + fsperm = &node->fspn_fsperm; + + assert(DATA_TYPE_NVLIST == type); + + uu_list_node_init(node, &node->fspn_list_node, + fspset->fsps_list_pool); + + idx = uu_list_numnodes(fspset->fsps_list); + fs_perm_init(fsperm, fspset, fsname); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + (void) parse_fs_perm(fsperm, nvl2); + + uu_list_insert(fspset->fsps_list, node, idx); + } + + return (0); +} + +static inline const char * +deleg_perm_comment(zfs_deleg_note_t note) +{ + const char *str = ""; + + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + case ZFS_DELEG_NOTE_ALLOW: + str = gettext("Must also have the permission that is being" + "\n\t\t\t\tallowed"); + break; + case ZFS_DELEG_NOTE_CLONE: + str = gettext("Must also have the 'create' ability and 'mount'" + "\n\t\t\t\tability in the origin file system"); + break; + case ZFS_DELEG_NOTE_CREATE: + str = gettext("Must also have the 
'mount' ability"); + break; + case ZFS_DELEG_NOTE_DESTROY: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DIFF: + str = gettext("Allows lookup of paths within a dataset;" + "\n\t\t\t\tgiven an object number. Ordinary users need this" + "\n\t\t\t\tin order to use zfs diff"); + break; + case ZFS_DELEG_NOTE_HOLD: + str = gettext("Allows adding a user hold to a snapshot"); + break; + case ZFS_DELEG_NOTE_MOUNT: + str = gettext("Allows mount/umount of ZFS datasets"); + break; + case ZFS_DELEG_NOTE_PROMOTE: + str = gettext("Must also have the 'mount'\n\t\t\t\tand" + " 'promote' ability in the origin file system"); + break; + case ZFS_DELEG_NOTE_RECEIVE: + str = gettext("Must also have the 'mount' and 'create'" + " ability"); + break; + case ZFS_DELEG_NOTE_RELEASE: + str = gettext("Allows releasing a user hold which\n\t\t\t\t" + "might destroy the snapshot"); + break; + case ZFS_DELEG_NOTE_RENAME: + str = gettext("Must also have the 'mount' and 'create'" + "\n\t\t\t\tability in the new parent"); + break; + case ZFS_DELEG_NOTE_ROLLBACK: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SEND: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SHARE: + str = gettext("Allows sharing file systems over NFS or SMB" + "\n\t\t\t\tprotocols"); + break; + case ZFS_DELEG_NOTE_SNAPSHOT: + str = gettext(""); + break; +/* + * case ZFS_DELEG_NOTE_VSCAN: + * str = gettext(""); + * break; + */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + str = gettext("Allows accessing any groupquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPUSED: + str = gettext("Allows reading any groupused@... property"); + break; + case ZFS_DELEG_NOTE_USERPROP: + str = gettext("Allows changing any user property"); + break; + case ZFS_DELEG_NOTE_USERQUOTA: + str = gettext("Allows accessing any userquota@... property"); + break; + case ZFS_DELEG_NOTE_USERUSED: + str = gettext("Allows reading any userused@... property"); + break; + /* other */ + default: + str = ""; + } + + return (str); +} + +struct allow_opts { + boolean_t local; + boolean_t descend; + boolean_t user; + boolean_t group; + boolean_t everyone; + boolean_t create; + boolean_t set; + boolean_t recursive; /* unallow only */ + boolean_t prt_usage; + + boolean_t prt_perms; + char *who; + char *perms; + const char *dataset; +}; + +static inline int +prop_cmp(const void *a, const void *b) +{ + const char *str1 = *(const char **)a; + const char *str2 = *(const char **)b; + return (strcmp(str1, str2)); +} + +static void +allow_usage(boolean_t un, boolean_t requested, const char *msg) +{ + const char *opt_desc[] = { + "-h", gettext("show this help message and exit"), + "-l", gettext("set permission locally"), + "-d", gettext("set permission for descents"), + "-u", gettext("set permission for user"), + "-g", gettext("set permission for group"), + "-e", gettext("set permission for everyone"), + "-c", gettext("set create time permission"), + "-s", gettext("define permission set"), + /* unallow only */ + "-r", gettext("remove permissions recursively"), + }; + size_t unallow_size = sizeof (opt_desc) / sizeof (char *); + size_t allow_size = unallow_size - 2; + const char *props[ZFS_NUM_PROPS]; + int i; + size_t count = 0; + FILE *fp = requested ? stdout : stderr; + zprop_desc_t *pdtbl = zfs_prop_get_table(); + const char *fmt = gettext("%-16s %-14s\t%s\n"); + + (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : + HELP_ALLOW)); + (void) fprintf(fp, gettext("Options:\n")); + for (i = 0; i < (un ? 
unallow_size : allow_size); i++) { + const char *opt = opt_desc[i++]; + const char *optdsc = opt_desc[i]; + (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); + } + + (void) fprintf(fp, gettext("\nThe following permissions are " + "supported:\n\n")); + (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), + gettext("NOTES")); + for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { + const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; + zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; + const char *perm_type = deleg_perm_type(perm_note); + const char *perm_comment = deleg_perm_comment(perm_note); + (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); + } + + for (i = 0; i < ZFS_NUM_PROPS; i++) { + zprop_desc_t *pd = &pdtbl[i]; + if (pd->pd_visible != B_TRUE) + continue; + + if (pd->pd_attr == PROP_READONLY) + continue; + + props[count++] = pd->pd_name; + } + props[count] = NULL; + + qsort(props, count, sizeof (char *), prop_cmp); + + for (i = 0; i < count; i++) + (void) fprintf(fp, fmt, props[i], gettext("property"), ""); + + if (msg != NULL) + (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); + + exit(requested ? 0 : 2); +} + +static inline const char * +munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, + char **permsp) +{ + if (un && argc == expected_argc - 1) + *permsp = NULL; + else if (argc == expected_argc) + *permsp = argv[argc - 2]; + else + allow_usage(un, B_FALSE, + gettext("wrong number of parameters\n")); + + return (argv[argc - 1]); +} + +static void +parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) +{ + int uge_sum = opts->user + opts->group + opts->everyone; + int csuge_sum = opts->create + opts->set + uge_sum; + int ldcsuge_sum = csuge_sum + opts->local + opts->descend; + int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; + + if (uge_sum > 1) + allow_usage(un, B_FALSE, + gettext("-u, -g, and -e are mutually exclusive\n")); + + if (opts->prt_usage) { + if (argc == 0 && all_sum == 0) + allow_usage(un, B_TRUE, NULL); + else + usage(B_FALSE); + } + + if (opts->set) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -s\n")); + + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + if (argv[0][0] != '@') + allow_usage(un, B_FALSE, + gettext("invalid set name: missing '@' prefix\n")); + opts->who = argv[0]; + } else if (opts->create) { + if (ldcsuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -c\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (opts->everyone) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -e\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") + == 0) { + opts->everyone = B_TRUE; + argc--; + argv++; + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (argc == 1 && !un) { + opts->prt_perms = B_TRUE; + opts->dataset = argv[argc-1]; + } else { + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + opts->who = argv[0]; + } + + if (!opts->local && !opts->descend) { + opts->local = B_TRUE; + opts->descend = B_TRUE; + } +} + +static void +store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, + const char *who, char *perms, nvlist_t *top_nvl) +{ + int i; + char ld[2] = { '\0', '\0' }; + char who_buf[MAXNAMELEN + 32]; + char base_type = '\0'; + char set_type = '\0'; + nvlist_t *base_nvl = NULL; + nvlist_t *set_nvl = NULL; + nvlist_t *nvl; + + if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + switch (type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + set_type = ZFS_DELEG_NAMED_SET_SETS; + base_type = ZFS_DELEG_NAMED_SET; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + set_type = ZFS_DELEG_CREATE_SETS; + base_type = ZFS_DELEG_CREATE; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + set_type = ZFS_DELEG_USER_SETS; + base_type = ZFS_DELEG_USER; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + set_type = ZFS_DELEG_GROUP_SETS; + base_type = ZFS_DELEG_GROUP; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + set_type = ZFS_DELEG_EVERYONE_SETS; + base_type = ZFS_DELEG_EVERYONE; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + + default: + assert(set_type != '\0' && base_type != '\0'); + } + + if (perms != NULL) { + char *curr = perms; + char *end = curr + strlen(perms); + + while (curr < end) { + char *delim = strchr(curr, ','); + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + if (curr[0] == '@') + nvl = set_nvl; + else + nvl = base_nvl; + + (void) nvlist_add_boolean(nvl, curr); + if (delim != end) + *delim = ','; + curr = delim + 1; + } + + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (!nvlist_empty(base_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), 
"%c%c$%s", + base_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + base_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + base_nvl); + } + + + if (!nvlist_empty(set_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + set_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + set_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + set_nvl); + } + } + } else { + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", base_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", base_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", set_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", set_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + } + } +} + +static int +construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) +{ + if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + if (opts->set) { + store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, + opts->descend, opts->who, opts->perms, *nvlp); + } else if (opts->create) { + store_allow_perm(ZFS_DELEG_CREATE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else if (opts->everyone) { + store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else { + char *curr = opts->who; + char *end = curr + strlen(curr); + + while (curr < end) { + const char *who; + zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; + char *endch; + char *delim = strchr(curr, ','); + char errbuf[256]; + char id[64]; + struct passwd *p = NULL; + struct group *g = NULL; + + uid_t rid; + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + rid = (uid_t)strtol(curr, &endch, 0); + if (opts->user) { + who_type = ZFS_DELEG_USER; + if (*endch != '\0') + p = getpwnam(curr); + else + p = getpwuid(rid); + + if (p != NULL) + rid = p->pw_uid; + else { + (void) snprintf(errbuf, 256, gettext( + "invalid user %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else if (opts->group) { + who_type = ZFS_DELEG_GROUP; + if (*endch != '\0') + g = getgrnam(curr); + else + g = getgrgid(rid); + + if (g != NULL) + rid = g->gr_gid; + else { + (void) snprintf(errbuf, 256, gettext( + "invalid group %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else { + if (*endch != '\0') { + p = getpwnam(curr); + } else { + p = getpwuid(rid); + } + + if (p == NULL) { + if (*endch != '\0') { + g = getgrnam(curr); + } else { + g = getgrgid(rid); + } + } + + if (p != NULL) { + who_type = ZFS_DELEG_USER; + rid = p->pw_uid; + } else if (g != NULL) { + who_type = ZFS_DELEG_GROUP; + rid = g->gr_gid; + } else { + (void) snprintf(errbuf, 256, gettext( + "invalid user/group %s"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } + + (void) sprintf(id, "%u", rid); + who = id; + + store_allow_perm(who_type, opts->local, + opts->descend, who, opts->perms, *nvlp); + curr = delim + 1; + } + } + + return (0); +} + +static void +print_set_creat_perms(uu_avl_t *who_avl) +{ + const char *sc_title[] = { + gettext("Permission sets:\n"), + gettext("Create time permissions:\n"), + NULL + }; + const char **title_ptr = sc_title; + who_perm_node_t *who_node = NULL; + int prev_weight = -1; + + for (who_node = uu_avl_first(who_avl); 
who_node != NULL; + who_node = uu_avl_next(who_avl, who_node)) { + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + const char *who_name = who_node->who_perm.who_name; + int weight = who_type2weight(who_type); + boolean_t first = B_TRUE; + deleg_perm_node_t *deleg_node; + + if (prev_weight != weight) { + (void) printf(*title_ptr++); + prev_weight = weight; + } + + if (who_name == NULL || strnlen(who_name, 1) == 0) + (void) printf("\t"); + else + (void) printf("\t%s ", who_name); + + for (deleg_node = uu_avl_first(avl); deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (first) { + (void) printf("%s", + deleg_node->dpn_perm.dp_name); + first = B_FALSE; + } else + (void) printf(",%s", + deleg_node->dpn_perm.dp_name); + } + + (void) printf("\n"); + } +} + +static void +print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, + const char *title) +{ + who_perm_node_t *who_node = NULL; + boolean_t prt_title = B_TRUE; + uu_avl_walk_t *walk; + + if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((who_node = uu_avl_walk_next(walk)) != NULL) { + const char *who_name = who_node->who_perm.who_name; + const char *nice_who_name = who_node->who_perm.who_ug_name; + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + char delim = ' '; + deleg_perm_node_t *deleg_node; + boolean_t prt_who = B_TRUE; + + for (deleg_node = uu_avl_first(avl); + deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (local != deleg_node->dpn_perm.dp_local || + descend != deleg_node->dpn_perm.dp_descend) + continue; + + if (prt_who) { + const char *who = NULL; + if (prt_title) { + prt_title = B_FALSE; + (void) printf(title); + } + + switch (who_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + who = gettext("user"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + who = gettext("group"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + who = gettext("everyone"); + who_name = NULL; + break; + + default: + assert(who != NULL); + } + + prt_who = B_FALSE; + if (who_name == NULL) + (void) printf("\t%s", who); + else + (void) printf("\t%s %s", who, who_name); + } + + (void) printf("%c%s", delim, + deleg_node->dpn_perm.dp_name); + delim = ','; + } + + if (!prt_who) + (void) printf("\n"); + } + + uu_avl_walk_end(walk); +} + +static void +print_fs_perms(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = NULL; + char buf[MAXNAMELEN + 32]; + const char *dsname = buf; + + for (node = uu_list_first(fspset->fsps_list); node != NULL; + node = uu_list_next(fspset->fsps_list, node)) { + uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; + uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; + int left = 0; + + (void) snprintf(buf, sizeof (buf), + gettext("---- Permissions on %s "), + node->fspn_fsperm.fsp_name); + (void) printf(dsname); + left = 70 - strlen(buf); + while (left-- > 0) + (void) printf("-"); + (void) printf("\n"); + + print_set_creat_perms(sc_avl); + print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, + gettext("Local permissions:\n")); + print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, + gettext("Descendent permissions:\n")); + print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, + gettext("Local+Descendent permissions:\n")); + } +} + +static fs_perm_set_t fs_perm_set = { 
NULL, NULL, NULL, NULL }; + +struct deleg_perms { + boolean_t un; + nvlist_t *nvl; +}; + +static int +set_deleg_perms(zfs_handle_t *zhp, void *data) +{ + struct deleg_perms *perms = (struct deleg_perms *)data; + zfs_type_t zfs_type = zfs_get_type(zhp); + + if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) + return (0); + + return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); +} + +static int +zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) +{ + zfs_handle_t *zhp; + nvlist_t *perm_nvl = NULL; + nvlist_t *update_perm_nvl = NULL; + int error = 1; + int c; + struct allow_opts opts = { 0 }; + + const char *optstr = un ? "ldugecsrh" : "ldugecsh"; + + /* check opts */ + while ((c = getopt(argc, argv, optstr)) != -1) { + switch (c) { + case 'l': + opts.local = B_TRUE; + break; + case 'd': + opts.descend = B_TRUE; + break; + case 'u': + opts.user = B_TRUE; + break; + case 'g': + opts.group = B_TRUE; + break; + case 'e': + opts.everyone = B_TRUE; + break; + case 's': + opts.set = B_TRUE; + break; + case 'c': + opts.create = B_TRUE; + break; + case 'r': + opts.recursive = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case 'h': + opts.prt_usage = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check arguments */ + parse_allow_args(argc, argv, un, &opts); + + /* try to open the dataset */ + if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) == NULL) { + (void) fprintf(stderr, "Failed to open dataset: %s\n", + opts.dataset); + return (-1); + } + + if (zfs_get_fsacl(zhp, &perm_nvl) != 0) + goto cleanup2; + + fs_perm_set_init(&fs_perm_set); + if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { + (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); + goto cleanup1; + } + + if (opts.prt_perms) + print_fs_perms(&fs_perm_set); + else { + (void) construct_fsacl_list(un, &opts, &update_perm_nvl); + if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) + goto cleanup0; + + if (un && opts.recursive) { + struct deleg_perms data = { un, update_perm_nvl }; + if (zfs_iter_filesystems(zhp, set_deleg_perms, + &data) != 0) + goto cleanup0; + } + } + + error = 0; + +cleanup0: + nvlist_free(perm_nvl); + nvlist_free(update_perm_nvl); +cleanup1: + fs_perm_set_fini(&fs_perm_set); +cleanup2: + zfs_close(zhp); + + return (error); +} + +static int +zfs_do_allow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); +} + +static int +zfs_do_unallow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); +} + +static int +zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) +{ + int errors = 0; + int i; + const char *tag; + boolean_t recursive = B_FALSE; + const char *opts = holding ? "rt" : "r"; + int c; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 2) + usage(B_FALSE); + + tag = argv[0]; + --argc; + ++argv; + + if (holding && tag[0] == '.') { + /* tags starting with '.' 
are reserved for libzfs */ + (void) fprintf(stderr, gettext("tag may not start with '.'\n")); + usage(B_FALSE); + } + + for (i = 0; i < argc; ++i) { + zfs_handle_t *zhp; + char parent[ZFS_MAX_DATASET_NAME_LEN]; + const char *delim; + char *path = argv[i]; + + delim = strchr(path, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), path); + ++errors; + continue; + } + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; + + zhp = zfs_open(g_zfs, parent, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ++errors; + continue; + } + if (holding) { + if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) + ++errors; + } else { + if (zfs_release(zhp, delim+1, tag, recursive) != 0) + ++errors; + } + zfs_close(zhp); + } + + return (errors != 0); +} + +/* + * zfs hold [-r] [-t] ... + * + * -r Recursively hold + * + * Apply a user-hold with the given tag to the list of snapshots. + */ +static int +zfs_do_hold(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); +} + +/* + * zfs release [-r] ... + * + * -r Recursively release + * + * Release a user-hold with the given tag from the list of snapshots. + */ +static int +zfs_do_release(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); +} + +typedef struct holds_cbdata { + boolean_t cb_recursive; + const char *cb_snapname; + nvlist_t **cb_nvlp; + size_t cb_max_namelen; + size_t cb_max_taglen; +} holds_cbdata_t; + +#define STRFTIME_FMT_STR "%a %b %e %k:%M %Y" +#define DATETIME_BUF_LEN (32) +/* + * + */ +static void +print_holds(boolean_t scripted, boolean_t literal, size_t nwidth, + size_t tagwidth, nvlist_t *nvl) +{ + int i; + nvpair_t *nvp = NULL; + char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; + const char *col; + + if (!scripted) { + for (i = 0; i < 3; i++) { + col = gettext(hdr_cols[i]); + if (i < 2) + (void) printf("%-*s ", i ? tagwidth : nwidth, + col); + else + (void) printf("%s\n", col); + } + } + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + char *zname = nvpair_name(nvp); + nvlist_t *nvl2; + nvpair_t *nvp2 = NULL; + (void) nvpair_value_nvlist(nvp, &nvl2); + while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { + char tsbuf[DATETIME_BUF_LEN]; + char *tagname = nvpair_name(nvp2); + uint64_t val = 0; + time_t time; + struct tm t; + char sep = scripted ? '\t' : ' '; + size_t sepnum = scripted ? 1 : 2; + + (void) nvpair_value_uint64(nvp2, &val); + if (literal) + snprintf(tsbuf, DATETIME_BUF_LEN, "%llu", val); + else { + time = (time_t)val; + (void) localtime_r(&time, &t); + (void) strftime(tsbuf, DATETIME_BUF_LEN, + gettext(STRFTIME_FMT_STR), &t); + } + + (void) printf("%-*s%*c%-*s%*c%s\n", nwidth, zname, + sepnum, sep, tagwidth, tagname, sepnum, sep, tsbuf); + } + } +} + +/* + * Generic callback function to list a dataset or snapshot. 
+ */ +static int +holds_callback(zfs_handle_t *zhp, void *data) +{ + holds_cbdata_t *cbp = data; + nvlist_t *top_nvl = *cbp->cb_nvlp; + nvlist_t *nvl = NULL; + nvpair_t *nvp = NULL; + const char *zname = zfs_get_name(zhp); + size_t znamelen = strlen(zname); + + if (cbp->cb_recursive && cbp->cb_snapname != NULL) { + const char *snapname; + char *delim = strchr(zname, '@'); + if (delim == NULL) + return (0); + + snapname = delim + 1; + if (strcmp(cbp->cb_snapname, snapname)) + return (0); + } + + if (zfs_get_holds(zhp, &nvl) != 0) + return (-1); + + if (znamelen > cbp->cb_max_namelen) + cbp->cb_max_namelen = znamelen; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *tag = nvpair_name(nvp); + size_t taglen = strlen(tag); + if (taglen > cbp->cb_max_taglen) + cbp->cb_max_taglen = taglen; + } + + return (nvlist_add_nvlist(top_nvl, zname, nvl)); +} + +/* + * zfs holds [-Hp] [-r | -d max] ... + * + * -H Suppress header output + * -p Output literal values + * -r Recursively search for holds + * -d max Limit depth of recursive search + */ +static int +zfs_do_holds(int argc, char **argv) +{ + int errors = 0; + int c; + int i; + boolean_t scripted = B_FALSE; + boolean_t literal = B_FALSE; + boolean_t recursive = B_FALSE; + const char *opts = "d:rHp"; + nvlist_t *nvl; + + int types = ZFS_TYPE_SNAPSHOT; + holds_cbdata_t cb = { 0 }; + + int limit = 0; + int ret = 0; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'd': + limit = parse_depth(optarg, &flags); + recursive = B_TRUE; + break; + case 'r': + recursive = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'p': + literal = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (recursive) { + types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + flags |= ZFS_ITER_RECURSE; + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) + usage(B_FALSE); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + for (i = 0; i < argc; ++i) { + char *snapshot = argv[i]; + const char *delim; + const char *snapname = NULL; + + delim = strchr(snapshot, '@'); + if (delim != NULL) { + snapname = delim + 1; + if (recursive) + snapshot[delim - snapshot] = '\0'; + } + + cb.cb_recursive = recursive; + cb.cb_snapname = snapname; + cb.cb_nvlp = &nvl; + + /* + * 1. collect holds data, set format options + */ + ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, + holds_callback, &cb); + if (ret != 0) + ++errors; + } + + /* + * 2. 
print holds data + */ + print_holds(scripted, literal, cb.cb_max_namelen, cb.cb_max_taglen, + nvl); + + if (nvlist_empty(nvl)) + (void) printf(gettext("no datasets available\n")); + + nvlist_free(nvl); + + return (0 != errors); +} + +#define CHECK_SPINNER 30 +#define SPINNER_TIME 3 /* seconds */ +#define MOUNT_TIME 5 /* seconds */ + +static int +get_one_dataset(zfs_handle_t *zhp, void *data) +{ + static char *spin[] = { "-", "\\", "|", "/" }; + static int spinval = 0; + static int spincheck = 0; static time_t last_spin_time = (time_t)0; - get_all_cbdata_t *cbp = data; + get_all_cb_t *cbp = data; zfs_type_t type = zfs_get_type(zhp); if (cbp->cb_verbose) { if (--spincheck < 0) { time_t now = time(NULL); if (last_spin_time + SPINNER_TIME < now) { - (void) printf("\b%c", spin[spinval++ % 4]); - (void) fflush(stdout); + update_progress(spin[spinval++ % 4]); last_spin_time = now; } spincheck = CHECK_SPINNER; @@ -2904,8 +5793,7 @@ get_one_dataset(zfs_handle_t *zhp, void /* * Interate over any nested datasets. */ - if (type == ZFS_TYPE_FILESYSTEM && - zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { + if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } @@ -2913,83 +5801,32 @@ get_one_dataset(zfs_handle_t *zhp, void /* * Skip any datasets whose type does not match. */ - if ((type & cbp->cb_types) == 0) { + if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } - - if (cbp->cb_alloc == cbp->cb_used) { - zfs_handle_t **handles; - - if (cbp->cb_alloc == 0) - cbp->cb_alloc = 64; - else - cbp->cb_alloc *= 2; - - handles = safe_malloc(cbp->cb_alloc * sizeof (void *)); - - if (cbp->cb_handles) { - bcopy(cbp->cb_handles, handles, - cbp->cb_used * sizeof (void *)); - free(cbp->cb_handles); - } - - cbp->cb_handles = handles; - } - - cbp->cb_handles[cbp->cb_used++] = zhp; + libzfs_add_handle(cbp, zhp); + assert(cbp->cb_used <= cbp->cb_alloc); return (0); } static void -get_all_datasets(uint_t types, zfs_handle_t ***dslist, size_t *count, - boolean_t verbose) +get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t verbose) { - get_all_cbdata_t cb = { 0 }; - cb.cb_types = types; + get_all_cb_t cb = { 0 }; cb.cb_verbose = verbose; + cb.cb_getone = get_one_dataset; - if (verbose) { - (void) printf("%s: *", gettext("Reading ZFS config")); - (void) fflush(stdout); - } - + if (verbose) + set_progress_header(gettext("Reading ZFS config")); (void) zfs_iter_root(g_zfs, get_one_dataset, &cb); *dslist = cb.cb_handles; *count = cb.cb_used; - if (verbose) { - (void) printf("\b%s\n", gettext("done.")); - } -} - -static int -dataset_cmp(const void *a, const void *b) -{ - zfs_handle_t **za = (zfs_handle_t **)a; - zfs_handle_t **zb = (zfs_handle_t **)b; - char mounta[MAXPATHLEN]; - char mountb[MAXPATHLEN]; - boolean_t gota, gotb; - - if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0) - verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta, - sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); - if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0) - verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb, - sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); - - if (gota && gotb) - return (strcmp(mounta, mountb)); - - if (gota) - return (-1); - if (gotb) - return (1); - - return (strcmp(zfs_get_name(a), zfs_get_name(b))); + if (verbose) + finish_progress(gettext("done.")); } /* @@ -3013,216 +5850,197 @@ share_mount_one(zfs_handle_t *zhp, int o const char *cmdname = op == OP_SHARE ? 
"share" : "mount"; struct mnttab mnt; uint64_t zoned, canmount; - zfs_type_t type = zfs_get_type(zhp); boolean_t shared_nfs, shared_smb; - assert(type & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)); - - if (type == ZFS_TYPE_FILESYSTEM) { - /* - * Check to make sure we can mount/share this dataset. If we - * are in the global zone and the filesystem is exported to a - * local zone, or if we are in a local zone and the - * filesystem is not exported, then it is an error. - */ - zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); - - if (zoned && getzoneid() == GLOBAL_ZONEID) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': " - "dataset is exported to a local zone\n"), cmdname, - zfs_get_name(zhp)); - return (1); - - } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot %s '%s': " - "permission denied\n"), cmdname, - zfs_get_name(zhp)); - return (1); - } - - /* - * Ignore any filesystems which don't apply to us. This - * includes those with a legacy mountpoint, or those with - * legacy share options. - */ - verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, - sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, - sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, - sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); - - if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && - strcmp(smbshareopts, "off") == 0) { - if (!explicit) - return (0); - - (void) fprintf(stderr, gettext("cannot share '%s': " - "legacy share\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use share(1M) to " - "share this filesystem, or set " - "sharenfs property on\n")); - return (1); - } + assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); - /* - * We cannot share or mount legacy filesystems. If the - * shareopts is non-legacy but the mountpoint is legacy, we - * treat it as a legacy share. - */ - if (strcmp(mountpoint, "legacy") == 0) { - if (!explicit) - return (0); + /* + * Check to make sure we can mount/share this dataset. If we + * are in the global zone and the filesystem is exported to a + * local zone, or if we are in a local zone and the + * filesystem is not exported, then it is an error. 
+ */ + zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); - (void) fprintf(stderr, gettext("cannot %s '%s': " - "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use %s(1M) to " - "%s this filesystem\n"), cmdname, cmdname); - return (1); - } + if (zoned && getzoneid() == GLOBAL_ZONEID) { + if (!explicit) + return (0); - if (strcmp(mountpoint, "none") == 0) { - if (!explicit) - return (0); + (void) fprintf(stderr, gettext("cannot %s '%s': " + "dataset is exported to a local zone\n"), cmdname, + zfs_get_name(zhp)); + return (1); - (void) fprintf(stderr, gettext("cannot %s '%s': no " - "mountpoint set\n"), cmdname, zfs_get_name(zhp)); - return (1); - } + } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { + if (!explicit) + return (0); - /* - * canmount explicit outcome - * on no pass through - * on yes pass through - * off no return 0 - * off yes display error, return 1 - * noauto no return 0 - * noauto yes pass through - */ - canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); - if (canmount == ZFS_CANMOUNT_OFF) { - if (!explicit) - return (0); + (void) fprintf(stderr, gettext("cannot %s '%s': " + "permission denied\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } - (void) fprintf(stderr, gettext("cannot %s '%s': " - "'canmount' property is set to 'off'\n"), cmdname, - zfs_get_name(zhp)); - return (1); - } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { + /* + * Ignore any filesystems which don't apply to us. This + * includes those with a legacy mountpoint, or those with + * legacy share options. + */ + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, + sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, + sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); + + if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && + strcmp(smbshareopts, "off") == 0) { + if (!explicit) return (0); - } - - /* - * At this point, we have verified that the mountpoint and/or - * shareopts are appropriate for auto management. If the - * filesystem is already mounted or shared, return (failing - * for explicit requests); otherwise mount or share the - * filesystem. - */ - switch (op) { - case OP_SHARE: - shared_nfs = zfs_is_shared_nfs(zhp, NULL); - shared_smb = zfs_is_shared_smb(zhp, NULL); + (void) fprintf(stderr, gettext("cannot share '%s': " + "legacy share\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("to " + "share this filesystem set " + "sharenfs property on\n")); + return (1); + } - if (shared_nfs && shared_smb || - (shared_nfs && strcmp(shareopts, "on") == 0 && - strcmp(smbshareopts, "off") == 0) || - (shared_smb && strcmp(smbshareopts, "on") == 0 && - strcmp(shareopts, "off") == 0)) { - if (!explicit) - return (0); + /* + * We cannot share or mount legacy filesystems. If the + * shareopts is non-legacy but the mountpoint is legacy, we + * treat it as a legacy share. 
+ */ + if (strcmp(mountpoint, "legacy") == 0) { + if (!explicit) + return (0); - (void) fprintf(stderr, gettext("cannot share " - "'%s': filesystem already shared\n"), - zfs_get_name(zhp)); - return (1); - } + (void) fprintf(stderr, gettext("cannot %s '%s': " + "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use %s(8) to " + "%s this filesystem\n"), cmdname, cmdname); + return (1); + } - if (!zfs_is_mounted(zhp, NULL) && - zfs_mount(zhp, NULL, 0) != 0) - return (1); + if (strcmp(mountpoint, "none") == 0) { + if (!explicit) + return (0); - if (protocol == NULL) { - if (zfs_shareall(zhp) != 0) - return (1); - } else if (strcmp(protocol, "nfs") == 0) { - if (zfs_share_nfs(zhp)) - return (1); - } else if (strcmp(protocol, "smb") == 0) { - if (zfs_share_smb(zhp)) - return (1); - } else { - (void) fprintf(stderr, gettext("cannot share " - "'%s': invalid share type '%s' " - "specified\n"), - zfs_get_name(zhp), protocol); - return (1); - } + (void) fprintf(stderr, gettext("cannot %s '%s': no " + "mountpoint set\n"), cmdname, zfs_get_name(zhp)); + return (1); + } - break; + /* + * canmount explicit outcome + * on no pass through + * on yes pass through + * off no return 0 + * off yes display error, return 1 + * noauto no return 0 + * noauto yes pass through + */ + canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); + if (canmount == ZFS_CANMOUNT_OFF) { + if (!explicit) + return (0); - case OP_MOUNT: - if (options == NULL) - mnt.mnt_mntopts = ""; - else - mnt.mnt_mntopts = (char *)options; + (void) fprintf(stderr, gettext("cannot %s '%s': " + "'canmount' property is set to 'off'\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { + return (0); + } - if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && - zfs_is_mounted(zhp, NULL)) { - if (!explicit) - return (0); + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + if (!explicit) + return (0); - (void) fprintf(stderr, gettext("cannot mount " - "'%s': filesystem already mounted\n"), - zfs_get_name(zhp)); - return (1); - } + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Contains partially-completed state from " + "\"zfs receive -r\", which can be resumed with " + "\"zfs send -t\"\n"), + cmdname, zfs_get_name(zhp)); + return (1); + } - if (zfs_mount(zhp, options, flags) != 0) - return (1); - break; - } - } else { - assert(op == OP_SHARE); + /* + * At this point, we have verified that the mountpoint and/or + * shareopts are appropriate for auto management. If the + * filesystem is already mounted or shared, return (failing + * for explicit requests); otherwise mount or share the + * filesystem. + */ + switch (op) { + case OP_SHARE: - /* - * Ignore any volumes that aren't shared. 
- */ - verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts, - sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); + shared_nfs = zfs_is_shared_nfs(zhp, NULL); + shared_smb = zfs_is_shared_smb(zhp, NULL); - if (strcmp(shareopts, "off") == 0) { + if ((shared_nfs && shared_smb) || + (shared_nfs && strcmp(shareopts, "on") == 0 && + strcmp(smbshareopts, "off") == 0) || + (shared_smb && strcmp(smbshareopts, "on") == 0 && + strcmp(shareopts, "off") == 0)) { if (!explicit) return (0); - (void) fprintf(stderr, gettext("cannot share '%s': " - "'shareiscsi' property not set\n"), + (void) fprintf(stderr, gettext("cannot share " + "'%s': filesystem already shared\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("set 'shareiscsi' " - "property or use iscsitadm(1M) to share this " - "volume\n")); return (1); } - if (zfs_is_shared_iscsi(zhp)) { + if (!zfs_is_mounted(zhp, NULL) && + zfs_mount(zhp, NULL, 0) != 0) + return (1); + + if (protocol == NULL) { + if (zfs_shareall(zhp) != 0) + return (1); + } else if (strcmp(protocol, "nfs") == 0) { + if (zfs_share_nfs(zhp)) + return (1); + } else if (strcmp(protocol, "smb") == 0) { + if (zfs_share_smb(zhp)) + return (1); + } else { + (void) fprintf(stderr, gettext("cannot share " + "'%s': invalid share type '%s' " + "specified\n"), + zfs_get_name(zhp), protocol); + return (1); + } + + break; + + case OP_MOUNT: + if (options == NULL) + mnt.mnt_mntopts = ""; + else + mnt.mnt_mntopts = (char *)options; + + if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && + zfs_is_mounted(zhp, NULL)) { if (!explicit) return (0); - (void) fprintf(stderr, gettext("cannot share " - "'%s': volume already shared\n"), + (void) fprintf(stderr, gettext("cannot mount " + "'%s': filesystem already mounted\n"), zfs_get_name(zhp)); return (1); } - if (zfs_share_iscsi(zhp) != 0) + if (zfs_mount(zhp, options, flags) != 0) return (1); + break; } return (0); @@ -3234,19 +6052,16 @@ share_mount_one(zfs_handle_t *zhp, int o static void report_mount_progress(int current, int total) { - static int len; - static char *reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b" - "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; - static time_t last_progress_time; + static time_t last_progress_time = 0; time_t now = time(NULL); + char info[32]; /* report 1..n instead of 0..n-1 */ ++current; /* display header if we're here for the first time */ if (current == 1) { - (void) printf(gettext("Mounting ZFS filesystems: ")); - len = 0; + set_progress_header(gettext("Mounting ZFS filesystems")); } else if (current != total && last_progress_time + MOUNT_TIME >= now) { /* too soon to report again */ return; @@ -3254,13 +6069,12 @@ report_mount_progress(int current, int t last_progress_time = now; - /* back up to prepare for overwriting */ - if (len) - (void) printf("%*.*s", len, len, reverse); - - /* We put a newline at the end if this is the last one. */ - len = printf("(%d/%d)%s", current, total, current == total ? "\n" : ""); - (void) fflush(stdout); + (void) sprintf(info, "(%d/%d)", current, total); + + if (current == total) + finish_progress(info); + else + update_progress(info); } static void @@ -3289,7 +6103,7 @@ share_mount(int op, int argc, char **arg boolean_t verbose = B_FALSE; int c, ret = 0; char *options = NULL; - int types, flags = 0; + int flags = 0; /* check options */ while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":avo:O" : "a")) @@ -3316,7 +6130,7 @@ share_mount(int op, int argc, char **arg break; case 'O': - flags |= MS_OVERLAY; + warnx("no overlay mounts support on FreeBSD, ignoring"); break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -3339,24 +6153,16 @@ share_mount(int op, int argc, char **arg size_t i, count = 0; char *protocol = NULL; - if (op == OP_MOUNT) { - types = ZFS_TYPE_FILESYSTEM; - } else if (argc > 0) { - if (strcmp(argv[0], "nfs") == 0 || - strcmp(argv[0], "smb") == 0) { - types = ZFS_TYPE_FILESYSTEM; - } else if (strcmp(argv[0], "iscsi") == 0) { - types = ZFS_TYPE_VOLUME; - } else { + if (op == OP_SHARE && argc > 0) { + if (strcmp(argv[0], "nfs") != 0 && + strcmp(argv[0], "smb") != 0) { (void) fprintf(stderr, gettext("share type " - "must be 'nfs', 'smb' or 'iscsi'\n")); + "must be 'nfs' or 'smb'\n")); usage(B_FALSE); } protocol = argv[0]; argc--; argv++; - } else { - types = ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; } if (argc != 0) { @@ -3364,12 +6170,13 @@ share_mount(int op, int argc, char **arg usage(B_FALSE); } - get_all_datasets(types, &dslist, &count, verbose); + start_progress_timer(); + get_all_datasets(&dslist, &count, verbose); if (count == 0) return (0); - qsort(dslist, count, sizeof (void *), dataset_cmp); + qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp); for (i = 0; i < count; i++) { if (verbose) @@ -3383,8 +6190,7 @@ share_mount(int op, int argc, char **arg free(dslist); } else if (argc == 0) { - struct statvfs *sfs; - int i, n; + struct mnttab entry; if ((op == OP_SHARE) || (options != NULL)) { (void) fprintf(stderr, gettext("missing filesystem " @@ -3397,33 +6203,27 @@ share_mount(int op, int argc, char **arg * display any active ZFS mounts. We hide any snapshots, since * they are controlled automatically. */ - if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) { - fprintf(stderr, "getmntinfo(): %s\n", strerror(errno)); - return (0); - } - for (i = 0; i < n; i++) { - if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0 || - strchr(sfs[i].f_mntfromname, '@') != NULL) + rewind(mnttab_file); + while (getmntent(mnttab_file, &entry) == 0) { + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || + strchr(entry.mnt_special, '@') != NULL) continue; - (void) printf("%-30s %s\n", sfs[i].f_mntfromname, - sfs[i].f_mntonname); + (void) printf("%-30s %s\n", entry.mnt_special, + entry.mnt_mountp); } } else { zfs_handle_t *zhp; - types = ZFS_TYPE_FILESYSTEM; - if (op == OP_SHARE) - types |= ZFS_TYPE_VOLUME; - if (argc > 1) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) { + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; } else { ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, @@ -3436,7 +6236,7 @@ share_mount(int op, int argc, char **arg } /* - * zfs mount -a [nfs | iscsi] + * zfs mount -a [nfs] * zfs mount filesystem * * Mount all filesystems, or mount the given filesystem. @@ -3448,7 +6248,7 @@ zfs_do_mount(int argc, char **argv) } /* - * zfs share -a [nfs | iscsi | smb] + * zfs share -a [nfs | smb] * zfs share filesystem * * Share all filesystems, or share the given filesystem. @@ -3484,9 +6284,9 @@ static int unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) { zfs_handle_t *zhp; - int ret; + int ret = 0; struct stat64 statbuf; - struct mnttab entry, search = { 0 }; + struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? 
"unshare" : "unmount"; ino_t path_inode; @@ -3506,9 +6306,39 @@ unshare_unmount_path(int op, char *path, /* * Search for the given (major,minor) pair in the mount table. */ +#ifdef illumos rewind(mnttab_file); - search.mnt_mountp = path; - if ((ret = getmntany(mnttab_file, &entry, &search)) == 0) { + while ((ret = getextmntent(mnttab_file, &entry, 0)) == 0) { + if (entry.mnt_major == major(statbuf.st_dev) && + entry.mnt_minor == minor(statbuf.st_dev)) + break; + } +#endif +#ifdef __FreeBSD__ + { + struct statfs sfs; + + if (statfs(path, &sfs) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + ret = -1; + } + statfs2mnttab(&sfs, &entry); + } +#endif +#ifdef __NetBSD__ + { + struct statvfs sfs; + + if (statvfs(path, &sfs) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + ret = -1; + } + statvfs2mnttab(&sfs, &entry); + } +#endif + if (ret != 0) { if (op == OP_SHARE) { (void) fprintf(stderr, gettext("cannot %s '%s': not " "currently mounted\n"), cmdname, path); @@ -3556,8 +6386,10 @@ unshare_unmount_path(int op, char *path, strcmp(smbshare_prop, "off") == 0) { (void) fprintf(stderr, gettext("cannot unshare " "'%s': legacy share\n"), path); +#ifdef illumos (void) fprintf(stderr, gettext("use " "unshare(1M) to unshare this filesystem\n")); +#endif } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot unshare '%s': " "not currently shared\n"), path); @@ -3576,7 +6408,7 @@ unshare_unmount_path(int op, char *path, (void) fprintf(stderr, gettext("cannot unmount " "'%s': legacy mountpoint\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use umount(1M) " + (void) fprintf(stderr, gettext("use umount(8) " "to unmount this filesystem\n")); } else { ret = zfs_unmountall(zhp, flags); @@ -3598,9 +6430,9 @@ unshare_unmount(int op, int argc, char * int do_all = 0; int flags = 0; int ret = 0; - int types, c; + int c; zfs_handle_t *zhp; - char nfsiscsi_mnt_prop[ZFS_MAXPROPLEN]; + char nfs_mnt_prop[ZFS_MAXPROPLEN]; char sharesmb[ZFS_MAXPROPLEN]; /* check options */ @@ -3637,10 +6469,9 @@ unshare_unmount(int op, int argc, char * * the special type (dataset name), and walk the result in * reverse to make sure to get any snapshots first. 
*/ - struct statvfs *sfs; - int i, n; + struct mnttab entry; uu_avl_pool_t *pool; - uu_avl_t *tree; + uu_avl_t *tree = NULL; unshare_unmount_node_t *node; uu_avl_index_t idx; uu_avl_walk_t *walk; @@ -3650,66 +6481,61 @@ unshare_unmount(int op, int argc, char * usage(B_FALSE); } - if ((pool = uu_avl_pool_create("unmount_pool", + if (((pool = uu_avl_pool_create("unmount_pool", sizeof (unshare_unmount_node_t), offsetof(unshare_unmount_node_t, un_avlnode), - unshare_unmount_compare, - UU_DEFAULT)) == NULL) { - (void) fprintf(stderr, gettext("internal error: " - "out of memory\n")); - exit(1); - } - - if ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL) { - (void) fprintf(stderr, gettext("internal error: " - "out of memory\n")); - exit(1); - } + unshare_unmount_compare, UU_DEFAULT)) == NULL) || + ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) + nomem(); rewind(mnttab_file); - if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) { - (void) fprintf(stderr, gettext("internal error: " - "getmntinfo() failed\n")); - exit(1); - } - for (i = 0; i < n; i++) { + while (getmntent(mnttab_file, &entry) == 0) { /* ignore non-ZFS entries */ - if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0) + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* ignore snapshots */ - if (strchr(sfs[i].f_mntfromname, '@') != NULL) + if (strchr(entry.mnt_special, '@') != NULL) continue; - if ((zhp = zfs_open(g_zfs, sfs[i].f_mntfromname, + if ((zhp = zfs_open(g_zfs, entry.mnt_special, ZFS_TYPE_FILESYSTEM)) == NULL) { ret = 1; continue; } + /* + * Ignore datasets that are excluded/restricted by + * parent pool name. + */ + if (zpool_skip_pool(zfs_get_pool_name(zhp))) { + zfs_close(zhp); + continue; + } + switch (op) { case OP_SHARE: verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, - nfsiscsi_mnt_prop, - sizeof (nfsiscsi_mnt_prop), + nfs_mnt_prop, + sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); - if (strcmp(nfsiscsi_mnt_prop, "off") != 0) + if (strcmp(nfs_mnt_prop, "off") != 0) break; verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, - nfsiscsi_mnt_prop, - sizeof (nfsiscsi_mnt_prop), + nfs_mnt_prop, + sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); - if (strcmp(nfsiscsi_mnt_prop, "off") == 0) + if (strcmp(nfs_mnt_prop, "off") == 0) continue; break; case OP_MOUNT: /* Ignore legacy mounts */ verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, - nfsiscsi_mnt_prop, - sizeof (nfsiscsi_mnt_prop), + nfs_mnt_prop, + sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); - if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0) + if (strcmp(nfs_mnt_prop, "legacy") == 0) continue; /* Ignore canmount=noauto mounts */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == @@ -3721,13 +6547,7 @@ unshare_unmount(int op, int argc, char * node = safe_malloc(sizeof (unshare_unmount_node_t)); node->un_zhp = zhp; - - if ((node->un_mountp = strdup(sfs[i].f_mntonname)) == - NULL) { - (void) fprintf(stderr, gettext("internal error:" - " out of memory\n")); - exit(1); - } + node->un_mountp = safe_strdup(entry.mnt_mountp); uu_avl_node_init(node, &node->un_avlnode, pool); @@ -3745,11 +6565,8 @@ unshare_unmount(int op, int argc, char * * removing it from the AVL tree in the process. 
*/ if ((walk = uu_avl_walk_start(tree, - UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) { - (void) fprintf(stderr, - gettext("internal error: out of memory")); - exit(1); - } + UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) + nomem(); while ((node = uu_avl_walk_next(walk)) != NULL) { uu_avl_remove(tree, node); @@ -3777,29 +6594,6 @@ unshare_unmount(int op, int argc, char * uu_avl_destroy(tree); uu_avl_pool_destroy(pool); - if (op == OP_SHARE) { - /* - * Finally, unshare any volumes shared via iSCSI. - */ - zfs_handle_t **dslist = NULL; - size_t i, count = 0; - - get_all_datasets(ZFS_TYPE_VOLUME, &dslist, &count, - B_FALSE); - - if (count != 0) { - qsort(dslist, count, sizeof (void *), - dataset_cmp); - - for (i = 0; i < count; i++) { - if (zfs_unshare_iscsi(dslist[i]) != 0) - ret = 1; - zfs_close(dslist[i]); - } - - free(dslist); - } - } } else { if (argc != 1) { if (argc == 0) @@ -3821,91 +6615,65 @@ unshare_unmount(int op, int argc, char * return (unshare_unmount_path(op, argv[0], flags, B_FALSE)); - types = ZFS_TYPE_FILESYSTEM; - if (op == OP_SHARE) - types |= ZFS_TYPE_VOLUME; - - if ((zhp = zfs_open(g_zfs, argv[0], types)) == NULL) + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) return (1); - if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { - verify(zfs_prop_get(zhp, op == OP_SHARE ? - ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, - nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop), NULL, - NULL, 0, B_FALSE) == 0); - - switch (op) { - case OP_SHARE: - verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, - nfsiscsi_mnt_prop, - sizeof (nfsiscsi_mnt_prop), - NULL, NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, - sharesmb, sizeof (sharesmb), NULL, NULL, - 0, B_FALSE) == 0); - - if (strcmp(nfsiscsi_mnt_prop, "off") == 0 && - strcmp(sharesmb, "off") == 0) { - (void) fprintf(stderr, gettext("cannot " - "unshare '%s': legacy share\n"), - zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use " - "unshare(1M) to unshare this " - "filesystem\n")); - ret = 1; - } else if (!zfs_is_shared(zhp)) { - (void) fprintf(stderr, gettext("cannot " - "unshare '%s': not currently " - "shared\n"), zfs_get_name(zhp)); - ret = 1; - } else if (zfs_unshareall(zhp) != 0) { - ret = 1; - } - break; - - case OP_MOUNT: - if (strcmp(nfsiscsi_mnt_prop, "legacy") == 0) { - (void) fprintf(stderr, gettext("cannot " - "unmount '%s': legacy " - "mountpoint\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use " - "umount(1M) to unmount this " - "filesystem\n")); - ret = 1; - } else if (!zfs_is_mounted(zhp, NULL)) { - (void) fprintf(stderr, gettext("cannot " - "unmount '%s': not currently " - "mounted\n"), - zfs_get_name(zhp)); - ret = 1; - } else if (zfs_unmountall(zhp, flags) != 0) { - ret = 1; - } - break; - } - } else { - assert(op == OP_SHARE); + verify(zfs_prop_get(zhp, op == OP_SHARE ? 
+ ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, + nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, + NULL, 0, B_FALSE) == 0); - verify(zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, - nfsiscsi_mnt_prop, sizeof (nfsiscsi_mnt_prop), + switch (op) { + case OP_SHARE: + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, + sharesmb, sizeof (sharesmb), NULL, NULL, + 0, B_FALSE) == 0); - if (strcmp(nfsiscsi_mnt_prop, "off") == 0) { - (void) fprintf(stderr, gettext("cannot unshare " - "'%s': 'shareiscsi' property not set\n"), + if (strcmp(nfs_mnt_prop, "off") == 0 && + strcmp(sharesmb, "off") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': legacy share\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("set " - "'shareiscsi' property or use " - "iscsitadm(1M) to share this volume\n")); +#ifdef illumos + (void) fprintf(stderr, gettext("use " + "unshare(1M) to unshare this " + "filesystem\n")); +#endif + ret = 1; + } else if (!zfs_is_shared(zhp)) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': not currently " + "shared\n"), zfs_get_name(zhp)); ret = 1; - } else if (!zfs_is_shared_iscsi(zhp)) { + } else if (zfs_unshareall(zhp) != 0) { + ret = 1; + } + break; + + case OP_MOUNT: + if (strcmp(nfs_mnt_prop, "legacy") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unmount '%s': legacy " + "mountpoint\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use " + "umount(8) to unmount this " + "filesystem\n")); + ret = 1; + } else if (!zfs_is_mounted(zhp, NULL)) { (void) fprintf(stderr, gettext("cannot " - "unshare '%s': not currently shared\n"), + "unmount '%s': not currently " + "mounted\n"), zfs_get_name(zhp)); ret = 1; - } else if (zfs_unshare_iscsi(zhp) != 0) { + } else if (zfs_unmountall(zhp, flags) != 0) { ret = 1; } + break; } zfs_close(zhp); @@ -3938,14 +6706,69 @@ zfs_do_unshare(int argc, char **argv) return (unshare_unmount(OP_SHARE, argc, argv)); } +#ifdef __FreeBSD__ +/* + * Attach/detach the given dataset to/from the given jail + */ /* ARGSUSED */ static int -zfs_do_python(int argc, char **argv) +do_jail(int argc, char **argv, int attach) { - (void) execv(pypath, argv-1); - (void) printf("internal error: %s not found\n", pypath); - return (-1); + zfs_handle_t *zhp; + int jailid, ret; + + /* check number of arguments */ + if (argc < 3) { + (void) fprintf(stderr, gettext("missing argument(s)\n")); + usage(B_FALSE); + } + if (argc > 3) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + jailid = jail_getid(argv[1]); + if (jailid < 0) { + (void) fprintf(stderr, gettext("invalid jail id or name\n")); + usage(B_FALSE); + } + + zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + return (1); + + ret = (zfs_jail(zhp, jailid, attach) != 0); + + zfs_close(zhp); + return (ret); +} + +/* + * zfs jail jailid filesystem + * + * Attach the given dataset to the given jail + */ +/* ARGSUSED */ +static int +zfs_do_jail(int argc, char **argv) +{ + + return (do_jail(argc, argv, 1)); +} + +/* + * zfs unjail jailid filesystem + * + * Detach the given dataset from the given jail + */ +/* ARGSUSED */ +static int +zfs_do_unjail(int argc, char **argv) +{ + + return (do_jail(argc, argv, 0)); } +#endif /* __FreeBSD__ */ /* * Called when invoked as /etc/fs/zfs/mount. 
Do the mount if the mountpoint is @@ -3957,7 +6780,7 @@ manual_mount(int argc, char **argv) zfs_handle_t *zhp; char mountpoint[ZFS_MAXPROPLEN]; char mntopts[MNT_LINE_MAX] = { '\0' }; - int ret; + int ret = 0; int c; int flags = 0; char *dataset, *path; @@ -4018,7 +6841,7 @@ manual_mount(int argc, char **argv) /* check for legacy mountpoint and complain appropriately */ ret = 0; if (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) { - if (mount(dataset, path, MS_OPTIONSTR | flags, MNTTYPE_ZFS, + if (zmount(dataset, path, flags, MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) { (void) fprintf(stderr, gettext("mount failed: %s\n"), strerror(errno)); @@ -4026,12 +6849,12 @@ manual_mount(int argc, char **argv) } } else { (void) fprintf(stderr, gettext("filesystem '%s' cannot be " - "mounted using 'mount -F zfs'\n"), dataset); + "mounted using 'mount -t zfs'\n"), dataset); (void) fprintf(stderr, gettext("Use 'zfs set mountpoint=%s' " "instead.\n"), path); - (void) fprintf(stderr, gettext("If you must use 'mount -F zfs' " - "or /etc/vfstab, use 'zfs set mountpoint=legacy'.\n")); - (void) fprintf(stderr, gettext("See zfs(1M) for more " + (void) fprintf(stderr, gettext("If you must use 'mount -t zfs' " + "or /etc/fstab, use 'zfs set mountpoint=legacy'.\n")); + (void) fprintf(stderr, gettext("See zfs(8) for more " "information.\n")); ret = 1; } @@ -4099,10 +6922,190 @@ find_command_idx(char *command, int *idx return (1); } +static int +zfs_do_diff(int argc, char **argv) +{ + zfs_handle_t *zhp; + int flags = 0; + char *tosnap = NULL; + char *fromsnap = NULL; + char *atp, *copy; + int err = 0; + int c; + + while ((c = getopt(argc, argv, "FHt")) != -1) { + switch (c) { + case 'F': + flags |= ZFS_DIFF_CLASSIFY; + break; + case 'H': + flags |= ZFS_DIFF_PARSEABLE; + break; + case 't': + flags |= ZFS_DIFF_TIMESTAMP; + break; + default: + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, + gettext("must provide at least one snapshot name\n")); + usage(B_FALSE); + } + + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + fromsnap = argv[0]; + tosnap = (argc == 2) ? argv[1] : NULL; + + copy = NULL; + if (*fromsnap != '@') + copy = strdup(fromsnap); + else if (tosnap) + copy = strdup(tosnap); + if (copy == NULL) + usage(B_FALSE); + + if ((atp = strchr(copy, '@')) != NULL) + *atp = '\0'; + + if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + free(copy); + + /* + * Ignore SIGPIPE so that the library can give us + * information on any failure + */ + (void) sigignore(SIGPIPE); + + err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); + + zfs_close(zhp); + + return (err != 0); +} + +/* + * zfs bookmark + * + * Creates a bookmark with the given name from the given snapshot. 
+ */ +static int +zfs_do_bookmark(int argc, char **argv) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + zfs_handle_t *zhp; + nvlist_t *nvl; + int ret = 0; + int c; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing snapshot argument\n")); + goto usage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing bookmark argument\n")); + goto usage; + } + + if (strchr(argv[1], '#') == NULL) { + (void) fprintf(stderr, + gettext("invalid bookmark name '%s' -- " + "must contain a '#'\n"), argv[1]); + goto usage; + } + + if (argv[0][0] == '@') { + /* + * Snapshot name begins with @. + * Default to same fs as bookmark. + */ + (void) strncpy(snapname, argv[1], sizeof (snapname)); + *strchr(snapname, '#') = '\0'; + (void) strlcat(snapname, argv[0], sizeof (snapname)); + } else { + (void) strncpy(snapname, argv[0], sizeof (snapname)); + } + zhp = zfs_open(g_zfs, snapname, ZFS_TYPE_SNAPSHOT); + if (zhp == NULL) + goto usage; + zfs_close(zhp); + + + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, argv[1], snapname); + ret = lzc_bookmark(nvl, NULL); + fnvlist_free(nvl); + + if (ret != 0) { + const char *err_msg; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create bookmark '%s'"), argv[1]); + + switch (ret) { + case EXDEV: + err_msg = "bookmark is in a different pool"; + break; + case EEXIST: + err_msg = "bookmark exists"; + break; + case EINVAL: + err_msg = "invalid argument"; + break; + case ENOTSUP: + err_msg = "bookmark feature not enabled"; + break; + case ENOSPC: + err_msg = "out of space"; + break; + default: + err_msg = "unknown error"; + break; + } + (void) fprintf(stderr, "%s: %s\n", errbuf, + dgettext(TEXT_DOMAIN, err_msg)); + } + + return (ret != 0); + +usage: + usage(B_FALSE); + return (-1); +} + int main(int argc, char **argv) { - int ret; + int ret = 0; int i; char *progname; char *cmdname; @@ -4118,8 +7121,7 @@ main(int argc, char **argv) return (1); } - zpool_set_history_str("zfs", argc, argv, history_str); - verify(zpool_stage_history(g_zfs, history_str) == 0); + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); libzfs_print_on_error(g_zfs, B_TRUE); @@ -4162,6 +7164,12 @@ main(int argc, char **argv) cmdname = "receive"; /* + * The 'snap' command is an alias for 'snapshot' + */ + if (strcmp(cmdname, "snap") == 0) + cmdname = "snapshot"; + + /* * Special case '-?' */ if (strcmp(cmdname, "-?") == 0) @@ -4188,6 +7196,9 @@ main(int argc, char **argv) (void) fclose(mnttab_file); + if (ret == 0 && log_history) + (void) zpool_log_history(g_zfs, history_str); + libzfs_fini(g_zfs); /* Index: src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 zfs_util.h --- src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h 7 Aug 2009 18:32:18 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/cmd/zfs/zfs_util.h 12 Jun 2012 05:55:36 -0000 @@ -19,15 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #ifndef _ZFS_UTIL_H #define _ZFS_UTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -35,6 +32,7 @@ extern "C" { #endif void * safe_malloc(size_t size); +void nomem(void); libzfs_handle_t *g_zfs; #ifdef __cplusplus Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 zpool_iter.c --- src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c 7 Aug 2009 18:32:18 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/cmd/zpool/zpool_iter.c 27 Mar 2016 02:48:25 -0000 @@ -22,9 +22,11 @@ * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2016 Igor Kozhukhov . + */ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include #include #include #include @@ -131,7 +133,8 @@ pool_list_get(int argc, char **argv, zpr for (i = 0; i < argc; i++) { zpool_handle_t *zhp; - if (zhp = zpool_open_canfail(g_zfs, argv[i])) { + if ((zhp = zpool_open_canfail(g_zfs, argv[i])) != + NULL) { if (add_pool(zhp, zlp) != 0) *err = B_TRUE; } else { Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c,v retrieving revision 1.2 diff -u -p -r1.2 zpool_main.c --- src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c 2 Jan 2013 10:33:49 -0000 1.2 +++ src/external/cddl/osnet/dist/cmd/zpool/zpool_main.c 5 May 2017 16:30:16 -0000 @@ -20,10 +20,16 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012 by Frederik Wessels. All rights reserved. + * Copyright (c) 2012 Martin Matuska . All rights reserved. + * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. + * Copyright 2016 Igor Kozhukhov . + * Copyright 2016 Nexenta Systems, Inc. 
*/ +#include #include #include #include @@ -41,15 +47,16 @@ #include #include #include +#include +#include #include - -#include #include #include #include "zpool_util.h" #include "zfs_comutil.h" +#include "zfeature_common.h" #include "statcommon.h" @@ -58,6 +65,7 @@ static int zpool_do_destroy(int, char ** static int zpool_do_add(int, char **); static int zpool_do_remove(int, char **); +static int zpool_do_labelclear(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); @@ -66,6 +74,9 @@ static int zpool_do_status(int, char **) static int zpool_do_online(int, char **); static int zpool_do_offline(int, char **); static int zpool_do_clear(int, char **); +static int zpool_do_reopen(int, char **); + +static int zpool_do_reguid(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); @@ -114,6 +125,7 @@ typedef enum { HELP_HISTORY, HELP_IMPORT, HELP_IOSTAT, + HELP_LABELCLEAR, HELP_LIST, HELP_OFFLINE, HELP_ONLINE, @@ -124,7 +136,9 @@ typedef enum { HELP_UPGRADE, HELP_GET, HELP_SET, - HELP_SPLIT + HELP_SPLIT, + HELP_REGUID, + HELP_REOPEN } zpool_help_t; @@ -150,6 +164,8 @@ static zpool_command_t command_table[] = { "add", zpool_do_add, HELP_ADD }, { "remove", zpool_do_remove, HELP_REMOVE }, { NULL }, + { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, + { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, { "status", zpool_do_status, HELP_STATUS }, @@ -157,6 +173,7 @@ static zpool_command_t command_table[] = { "online", zpool_do_online, HELP_ONLINE }, { "offline", zpool_do_offline, HELP_OFFLINE }, { "clear", zpool_do_clear, HELP_CLEAR }, + { "reopen", zpool_do_reopen, HELP_REOPEN }, { NULL }, { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, @@ -168,6 +185,7 @@ static zpool_command_t command_table[] = { "import", zpool_do_import, HELP_IMPORT }, { "export", zpool_do_export, HELP_EXPORT }, { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, + { "reguid", zpool_do_reguid, HELP_REGUID }, { NULL }, { "history", zpool_do_history, HELP_HISTORY }, { "get", zpool_do_get, HELP_GET }, @@ -176,13 +194,14 @@ static zpool_command_t command_table[] = #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) -zpool_command_t *current_command; +static zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; - +static boolean_t log_history = B_TRUE; static uint_t timestamp_fmt = NODATE; static const char * -get_usage(zpool_help_t idx) { +get_usage(zpool_help_t idx) +{ switch (idx) { case HELP_ADD: return (gettext("\tadd [-fn] ...\n")); @@ -192,7 +211,7 @@ get_usage(zpool_help_t idx) { case HELP_CLEAR: return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: - return (gettext("\tcreate [-fn] [-o property=value] ... \n" + return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" "\t [-m mountpoint] [-R root] ...\n")); case HELP_DESTROY: @@ -205,44 +224,52 @@ get_usage(zpool_help_t idx) { return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" - "\timport [-d dir | -c cachefile] [-n] -F \n" + "\timport [-d dir | -c cachefile] [-F [-n]] \n" "\timport [-o mntopts] [-o property=value] ... \n" - "\t [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n" + "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " + "[-R root] [-F [-n]] -a\n" "\timport [-o mntopts] [-o property=value] ... 
\n" - "\t [-d dir | -c cachefile] [-D] [-f] [-R root] " - " [newpool]\n")); + "\t [-d dir | -c cachefile] [-D] [-f] [-m] [-N] " + "[-R root] [-F [-n]]\n" + "\t [newpool]\n")); case HELP_IOSTAT: return (gettext("\tiostat [-v] [-T d|u] [pool] ... [interval " "[count]]\n")); + case HELP_LABELCLEAR: + return (gettext("\tlabelclear [-f] \n")); case HELP_LIST: - return (gettext("\tlist [-H] [-o property[,...]] " - "[pool] ...\n")); + return (gettext("\tlist [-Hpv] [-o property[,...]] " + "[-T d|u] [pool] ... [interval [count]]\n")); case HELP_OFFLINE: return (gettext("\toffline [-t] ...\n")); case HELP_ONLINE: - return (gettext("\tonline ...\n")); + return (gettext("\tonline [-e] ...\n")); case HELP_REPLACE: return (gettext("\treplace [-f] " "[new-device]\n")); case HELP_REMOVE: return (gettext("\tremove ...\n")); + case HELP_REOPEN: + return (gettext("\treopen \n")); case HELP_SCRUB: return (gettext("\tscrub [-s] ...\n")); case HELP_STATUS: - return (gettext("\tstatus [-vx] [pool] ...\n")); + return (gettext("\tstatus [-vx] [-T d|u] [pool] ... [interval " + "[count]]\n")); case HELP_UPGRADE: - return (gettext("\tupgrade\n" - "\tupgrade -v\n" + return (gettext("\tupgrade [-v]\n" "\tupgrade [-V version] <-a | pool ...>\n")); case HELP_GET: - return (gettext("\tget <\"all\" | property[,...]> " - " ...\n")); + return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " + "<\"all\" | property[,...]> ...\n")); case HELP_SET: return (gettext("\tset \n")); case HELP_SPLIT: return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" "\t [-o property=value] " "[ ...]\n")); + case HELP_REGUID: + return (gettext("\treguid \n")); } abort(); @@ -316,6 +343,12 @@ usage(boolean_t requested) /* Iterate over all properties */ (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_POOL); + + (void) fprintf(fp, "\t%-15s ", "feature@..."); + (void) fprintf(fp, "YES disabled | enabled | active\n"); + + (void) fprintf(fp, gettext("\nThe feature@ properties must be " + "appended with a feature name.\nSee zpool-features(7).\n")); } /* @@ -359,6 +392,18 @@ print_vdev_tree(zpool_handle_t *zhp, con } } +static boolean_t +prop_list_contains_feature(nvlist_t *proplist) +{ + nvpair_t *nvp; + for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; + nvp = nvlist_next_nvpair(proplist, nvp)) { + if (zpool_prop_feature(nvpair_name(nvp))) + return (B_TRUE); + } + return (B_FALSE); +} + /* * Add a property pair (name, string-value) into a property nvlist. */ @@ -382,12 +427,34 @@ add_prop_list(const char *propname, char proplist = *props; if (poolprop) { - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); + + if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL && + !zpool_prop_feature(propname)) { (void) fprintf(stderr, gettext("property '%s' is " "not a valid pool property\n"), propname); return (2); } - normnm = zpool_prop_to_name(prop); + + /* + * feature@ properties and version should not be specified + * at the same time. 
+ */ + if ((prop == ZPROP_INVAL && zpool_prop_feature(propname) && + nvlist_exists(proplist, vname)) || + (prop == ZPOOL_PROP_VERSION && + prop_list_contains_feature(proplist))) { + (void) fprintf(stderr, gettext("'feature@' and " + "'version' properties cannot be specified " + "together\n")); + return (2); + } + + + if (zpool_prop_feature(propname)) + normnm = propname; + else + normnm = zpool_prop_to_name(prop); } else { if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { normnm = zfs_prop_to_name(fprop); @@ -520,11 +587,10 @@ zpool_do_add(int argc, char **argv) } /* - * zpool remove ... + * zpool remove ... * - * Removes the given vdev from the pool. Currently, this only supports removing - * spares and cache devices from the pool. Eventually, we'll want to support - * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs. + * Removes the given vdev from the pool. Currently, this supports removing + * spares, cache, and log devices from the pool. */ int zpool_do_remove(int argc, char **argv) @@ -560,7 +626,154 @@ zpool_do_remove(int argc, char **argv) } /* - * zpool create [-fn] [-o property=value] ... + * zpool labelclear [-f] + * + * -f Force clearing the label for the vdevs which are members of + * the exported or foreign pools. + * + * Verifies that the vdev is not active and zeros out the label information + * on the device. + */ +int +zpool_do_labelclear(int argc, char **argv) +{ + char vdev[MAXPATHLEN]; + char *name = NULL; + struct stat st; + int c, fd, ret = 0; + nvlist_t *config; + pool_state_t state; + boolean_t inuse = B_FALSE; + boolean_t force = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get vdev name */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing vdev name\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * Check if we were given absolute path and use it as is. + * Otherwise if the provided vdev name doesn't point to a file, + * try prepending dsk path and appending s0. 
+ */ + (void) strlcpy(vdev, argv[0], sizeof (vdev)); + if (vdev[0] != '/' && stat(vdev, &st) != 0) { + char *s; + + (void) snprintf(vdev, sizeof (vdev), "%s/%s", +#ifdef illumos + ZFS_DISK_ROOT, argv[0]); + if ((s = strrchr(argv[0], 's')) == NULL || + !isdigit(*(s + 1))) + (void) strlcat(vdev, "s0", sizeof (vdev)); +#else + "/dev", argv[0]); +#endif + if (stat(vdev, &st) != 0) { + (void) fprintf(stderr, gettext( + "failed to find device %s, try specifying absolute " + "path instead\n"), argv[0]); + return (1); + } + } + + if ((fd = open(vdev, O_RDWR)) < 0) { + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + vdev, strerror(errno)); + return (1); + } + + if (zpool_read_label(fd, &config) != 0 || config == NULL) { + (void) fprintf(stderr, + gettext("failed to read label from %s\n"), vdev); + return (1); + } + nvlist_free(config); + + ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to check state for %s\n"), vdev); + return (1); + } + + if (!inuse) + goto wipe_label; + + switch (state) { + default: + case POOL_STATE_ACTIVE: + case POOL_STATE_SPARE: + case POOL_STATE_L2CACHE: + (void) fprintf(stderr, gettext( + "%s is a member (%s) of pool \"%s\"\n"), + vdev, zpool_pool_state_to_name(state), name); + ret = 1; + goto errout; + + case POOL_STATE_EXPORTED: + if (force) + break; + (void) fprintf(stderr, gettext( + "use '-f' to override the following error:\n" + "%s is a member of exported pool \"%s\"\n"), + vdev, name); + ret = 1; + goto errout; + + case POOL_STATE_POTENTIALLY_ACTIVE: + if (force) + break; + (void) fprintf(stderr, gettext( + "use '-f' to override the following error:\n" + "%s is a member of potentially active pool \"%s\"\n"), + vdev, name); + ret = 1; + goto errout; + + case POOL_STATE_DESTROYED: + /* inuse should never be set for a destroyed pool */ + assert(0); + break; + } + +wipe_label: + ret = zpool_clear_label(fd); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to clear label for %s\n"), vdev); + } + +errout: + free(name); + (void) close(fd); + + return (ret); +} + +/* + * zpool create [-fnd] [-o property=value] ... * [-O file-system-property=value] ... * [-R root] [-m mountpoint] ... * @@ -569,8 +782,10 @@ zpool_do_remove(int argc, char **argv) * were to be created. * -R Create a pool under an alternate root * -m Set default mountpoint for the root dataset. By default it's - * '/' + * '/' * -o Set property=value. + * -d Don't automatically enable all supported pool features + * (individual features can be enabled with -o). * -O Set fsproperty=value in the pool's root file system * * Creates the named pool according to the given vdev specification. 
The @@ -583,6 +798,7 @@ zpool_do_create(int argc, char **argv) { boolean_t force = B_FALSE; boolean_t dryrun = B_FALSE; + boolean_t enable_all_pool_feat = B_TRUE; int c; nvlist_t *nvroot = NULL; char *poolname; @@ -594,7 +810,7 @@ zpool_do_create(int argc, char **argv) char *propval; /* check options */ - while ((c = getopt(argc, argv, ":fnR:m:o:O:")) != -1) { + while ((c = getopt(argc, argv, ":fndR:m:o:O:")) != -1) { switch (c) { case 'f': force = B_TRUE; @@ -602,6 +818,9 @@ zpool_do_create(int argc, char **argv) case 'n': dryrun = B_TRUE; break; + case 'd': + enable_all_pool_feat = B_FALSE; + break; case 'R': altroot = optarg; if (add_prop_list(zpool_prop_to_name( @@ -616,6 +835,7 @@ zpool_do_create(int argc, char **argv) goto errout; break; case 'm': + /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': @@ -629,6 +849,23 @@ zpool_do_create(int argc, char **argv) if (add_prop_list(optarg, propval, &props, B_TRUE)) goto errout; + + /* + * If the user is creating a pool that doesn't support + * feature flags, don't enable any features. + */ + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { + char *end; + u_longlong_t ver; + + ver = strtoull(propval, &end, 10); + if (*end == '\0' && + ver < SPA_VERSION_FEATURES) { + enable_all_pool_feat = B_FALSE; + } + } + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) + altroot = propval; break; case 'O': if ((propval = strchr(optarg, '=')) == NULL) { @@ -639,8 +876,18 @@ zpool_do_create(int argc, char **argv) *propval = '\0'; propval++; - if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) + /* + * Mountpoints are checked and then added later. + * Uniquely among properties, they can be specified + * more than once, to avoid conflict with -m. + */ + if (0 == strcmp(optarg, + zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { + mountpoint = propval; + } else if (add_prop_list(optarg, propval, &fsprops, + B_FALSE)) { goto errout; + } break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -694,7 +941,6 @@ zpool_do_create(int argc, char **argv) goto errout; } - if (altroot != NULL && altroot[0] != '/') { (void) fprintf(stderr, gettext("invalid alternate root '%s': " "must be an absolute path\n"), altroot); @@ -704,10 +950,11 @@ zpool_do_create(int argc, char **argv) /* * Check the validity of the mountpoint and direct the user to use the * '-m' mountpoint option if it looks like its in use. + * Ignore the checks if the '-f' option is given. */ - if (mountpoint == NULL || + if (!force && (mountpoint == NULL || (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && - strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { + strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0))) { char buf[MAXPATHLEN]; DIR *dirp; @@ -758,6 +1005,18 @@ zpool_do_create(int argc, char **argv) } } + /* + * Now that the mountpoint's validity has been checked, ensure that + * the property is set appropriately prior to creating the pool. + */ + if (mountpoint != NULL) { + ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), + mountpoint, &fsprops, B_FALSE); + if (ret != 0) + goto errout; + } + + ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run @@ -776,16 +1035,35 @@ zpool_do_create(int argc, char **argv) /* * Hand off to libzfs. 
*/ + if (enable_all_pool_feat) { + spa_feature_t i; + for (i = 0; i < SPA_FEATURES; i++) { + char propname[MAXPATHLEN]; + zfeature_info_t *feat = &spa_feature_table[i]; + + (void) snprintf(propname, sizeof (propname), + "feature@%s", feat->fi_uname); + + /* + * Skip feature if user specified it manually + * on the command line. + */ + if (nvlist_exists(props, propname)) + continue; + + ret = add_prop_list(propname, + ZFS_FEATURE_ENABLED, &props, B_TRUE); + if (ret != 0) + goto errout; + } + } + + ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { - if (mountpoint != NULL) - verify(zfs_prop_set(pool, - zfs_prop_to_name( - ZFS_PROP_MOUNTPOINT), - mountpoint) == 0); if (zfs_mount(pool, NULL, 0) == 0) ret = zfs_shareall(pool); zfs_close(pool); @@ -869,7 +1147,10 @@ zpool_do_destroy(int argc, char **argv) return (1); } - ret = (zpool_destroy(zhp) != 0); + /* The history must be logged as part of the export */ + log_history = B_FALSE; + + ret = (zpool_destroy(zhp, history_str) != 0); zpool_close(zhp); @@ -933,10 +1214,13 @@ zpool_do_export(int argc, char **argv) continue; } + /* The history must be logged as part of the export */ + log_history = B_FALSE; + if (hardforce) { - if (zpool_export_force(zhp) != 0) + if (zpool_export_force(zhp, history_str) != 0) ret = 1; - } else if (zpool_export(zhp, force) != 0) { + } else if (zpool_export(zhp, force, history_str) != 0) { ret = 1; } @@ -1044,21 +1328,23 @@ print_status_config(zpool_handle_t *zhp, int namewidth, int depth, boolean_t isspare) { nvlist_t **child; - uint_t c, children; + uint_t c, vsc, children; + pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6], repaired[7]; + char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; + uint64_t ashift; spare_cbdata_t cb; - char *state; - - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &c) == 0); + const char *state; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) children = 0; + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { /* @@ -1082,10 +1368,11 @@ print_status_config(zpool_handle_t *zhp, } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - ¬present) == 0) { + ¬present) == 0 || + vs->vs_state <= VDEV_STATE_CANT_OPEN) { char *path; - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - (void) printf(" was %s", path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) + (void) printf(" was %s", path); } else if (vs->vs_aux != 0) { (void) printf(" "); @@ -1106,6 +1393,14 @@ print_status_config(zpool_handle_t *zhp, (void) printf(gettext("newer version")); break; + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + + case VDEV_AUX_ASHIFT_TOO_BIG: + (void) printf(gettext("unsupported minimum blocksize")); + break; + case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &cb.cb_guid) == 0); @@ -1148,14 +1443,22 @@ print_status_config(zpool_handle_t *zhp, (void) printf(gettext("corrupted data")); break; } - } else if (vs->vs_scrub_repaired != 0 && children == 0) { - /* - * Report bytes resilvered/repaired on leaf devices. - */ - zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); - (void) printf(gettext(" %s %s"), repaired, - (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ? 
- "resilvered" : "repaired"); + } else if (children == 0 && !isspare && + VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift) { + (void) printf( + gettext(" block size: %dB configured, %dB native"), + 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); + } + + (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c); + + if (ps && ps->pss_state == DSS_SCANNING && + vs->vs_scan_processed != 0 && children == 0) { + (void) printf(gettext(" (%s)"), + (ps->pss_func == POOL_SCAN_RESILVER) ? + "resilvering" : "repairing"); } (void) printf("\n"); @@ -1195,7 +1498,7 @@ print_import_config(const char *name, nv strcmp(type, VDEV_TYPE_HOLE) == 0) return; - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); (void) printf("\t%*s%-*s", depth, "", namewidth - depth, name); @@ -1221,6 +1524,10 @@ print_import_config(const char *name, nv (void) printf(gettext("newer version")); break; + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + case VDEV_AUX_ERR_EXCEEDED: (void) printf(gettext("too many errors")); break; @@ -1324,6 +1631,7 @@ show_import(nvlist_t *config) const char *health; uint_t vsc; int namewidth; + char *comment; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); @@ -1334,15 +1642,15 @@ show_import(nvlist_t *config) verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); reason = zpool_import_status(config, &msgid); - (void) printf(gettext(" pool: %s\n"), name); - (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); - (void) printf(gettext(" state: %s"), health); + (void) printf(gettext(" pool: %s\n"), name); + (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); + (void) printf(gettext(" state: %s"), health); if (pool_state == POOL_STATE_DESTROYED) (void) printf(gettext(" (DESTROYED)")); (void) printf("\n"); @@ -1351,56 +1659,87 @@ show_import(nvlist_t *config) case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext("status: One or more devices are missing " - "from the system.\n")); + (void) printf(gettext(" status: One or more devices are " + "missing from the system.\n")); break; case ZPOOL_STATUS_CORRUPT_LABEL_R: case ZPOOL_STATUS_CORRUPT_LABEL_NR: - (void) printf(gettext("status: One or more devices contains " + (void) printf(gettext(" status: One or more devices contains " "corrupted data.\n")); break; case ZPOOL_STATUS_CORRUPT_DATA: - (void) printf(gettext("status: The pool data is corrupted.\n")); + (void) printf( + gettext(" status: The pool data is corrupted.\n")); break; case ZPOOL_STATUS_OFFLINE_DEV: - (void) printf(gettext("status: One or more devices " + (void) printf(gettext(" status: One or more devices " "are offlined.\n")); break; case ZPOOL_STATUS_CORRUPT_POOL: - (void) printf(gettext("status: The pool metadata is " + (void) printf(gettext(" status: The pool metadata is " "corrupted.\n")); break; case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext("status: The pool is formatted using an " - "older on-disk version.\n")); + (void) printf(gettext(" status: The pool is formatted using a " + "legacy on-disk 
version.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext("status: The pool is formatted using an " + (void) printf(gettext(" status: The pool is formatted using an " "incompatible version.\n")); break; + case ZPOOL_STATUS_FEAT_DISABLED: + (void) printf(gettext(" status: Some supported features are " + "not enabled on the pool.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("status: The pool uses the following " + "feature(s) not supported on this sytem:\n")); + zpool_print_unsup_feat(config); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("status: The pool can only be accessed " + "in read-only mode on this system. It\n\tcannot be " + "accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + break; + case ZPOOL_STATUS_HOSTID_MISMATCH: - (void) printf(gettext("status: The pool was last accessed by " + (void) printf(gettext(" status: The pool was last accessed by " "another system.\n")); break; case ZPOOL_STATUS_FAULTED_DEV_R: case ZPOOL_STATUS_FAULTED_DEV_NR: - (void) printf(gettext("status: One or more devices are " + (void) printf(gettext(" status: One or more devices are " "faulted.\n")); break; case ZPOOL_STATUS_BAD_LOG: - (void) printf(gettext("status: An intent log record cannot be " + (void) printf(gettext(" status: An intent log record cannot be " "read.\n")); break; + case ZPOOL_STATUS_RESILVERING: + (void) printf(gettext(" status: One or more devices were being " + "resilvered.\n")); + break; + + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + (void) printf(gettext("status: One or more devices were " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + break; + default: /* * No other status can be seen when importing pools. @@ -1412,44 +1751,64 @@ show_import(nvlist_t *config) * Print out an action according to the overall state of the pool. */ if (vs->vs_state == VDEV_STATE_HEALTHY) { - if (reason == ZPOOL_STATUS_VERSION_OLDER) - (void) printf(gettext("action: The pool can be " + if (reason == ZPOOL_STATUS_VERSION_OLDER || + reason == ZPOOL_STATUS_FEAT_DISABLED) { + (void) printf(gettext(" action: The pool can be " "imported using its name or numeric identifier, " "though\n\tsome features will not be available " "without an explicit 'zpool upgrade'.\n")); - else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) - (void) printf(gettext("action: The pool can be " + } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { + (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier and\n\tthe '-f' flag.\n")); - else - (void) printf(gettext("action: The pool can be " + } else { + (void) printf(gettext(" action: The pool can be " "imported using its name or numeric " "identifier.\n")); + } } else if (vs->vs_state == VDEV_STATE_DEGRADED) { - (void) printf(gettext("action: The pool can be imported " + (void) printf(gettext(" action: The pool can be imported " "despite missing or damaged devices. The\n\tfault " "tolerance of the pool may be compromised if imported.\n")); } else { switch (reason) { case ZPOOL_STATUS_VERSION_NEWER: - (void) printf(gettext("action: The pool cannot be " + (void) printf(gettext(" action: The pool cannot be " "imported. Access the pool on a system running " "newer\n\tsoftware, or recreate the pool from " "backup.\n")); break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("action: The pool cannot be " + "imported. 
Access the pool on a system that " + "supports\n\tthe required feature(s), or recreate " + "the pool from backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("action: The pool cannot be " + "imported in read-write mode. Import the pool " + "with\n" + "\t\"-o readonly=on\", access the pool on a system " + "that supports the\n\trequired feature(s), or " + "recreate the pool from backup.\n")); + break; case ZPOOL_STATUS_MISSING_DEV_R: case ZPOOL_STATUS_MISSING_DEV_NR: case ZPOOL_STATUS_BAD_GUID_SUM: - (void) printf(gettext("action: The pool cannot be " + (void) printf(gettext(" action: The pool cannot be " "imported. Attach the missing\n\tdevices and try " "again.\n")); break; default: - (void) printf(gettext("action: The pool cannot be " + (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); } } + /* Print the comment attached to the pool. */ + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + (void) printf(gettext("comment: %s\n"), comment); + /* * If the state is "closed" or "can't open", and the aux state * is "corrupt data": @@ -1467,10 +1826,10 @@ show_import(nvlist_t *config) } if (msgid != NULL) - (void) printf(gettext(" see: http://www.sun.com/msg/%s\n"), + (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), msgid); - (void) printf(gettext("config:\n\n")); + (void) printf(gettext(" config:\n\n")); namewidth = max_width(NULL, nvroot, 0, 0); if (namewidth < 10) @@ -1494,7 +1853,7 @@ show_import(nvlist_t *config) */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, - int force, nvlist_t *props, boolean_t do_verbatim) + nvlist_t *props, int flags) { zpool_handle_t *zhp; char *name; @@ -1508,11 +1867,12 @@ do_import(nvlist_t *config, const char * ZPOOL_CONFIG_POOL_STATE, &state) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); - if (version > SPA_VERSION) { + if (!SPA_VERSION_IS_SUPPORTED(version)) { (void) fprintf(stderr, gettext("cannot import '%s': pool " - "is formatted using a newer ZFS version\n"), name); + "is formatted using an unsupported ZFS version\n"), name); return (1); - } else if (state != POOL_STATE_EXPORTED && !force) { + } else if (state != POOL_STATE_EXPORTED && + !(flags & ZFS_IMPORT_ANY_HOST)) { uint64_t hostid; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, @@ -1546,7 +1906,7 @@ do_import(nvlist_t *config, const char * } } - if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0) + if (zpool_import_props(g_zfs, config, newname, props, flags) != 0) return (1); if (newname != NULL) @@ -1556,6 +1916,7 @@ do_import(nvlist_t *config, const char * return (1); if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + !(flags & ZFS_IMPORT_ONLY) && zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); @@ -1597,6 +1958,11 @@ do_import(nvlist_t *config, const char * * * -n See if rewind would work, but don't actually rewind. * + * -N Import the pool but don't mount datasets. + * + * -T Specify a starting txg to use for import. This option is + * intentionally undocumented option for testing purposes. + * * -a Import all pools found. * * -o Set property=value and/or temporary mount options (without '='). 
@@ -1615,7 +1981,6 @@ zpool_do_import(int argc, char **argv) boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; char *mntopts = NULL; - boolean_t do_force = B_FALSE; nvpair_t *elem; nvlist_t *config; uint64_t searchguid = 0; @@ -1625,17 +1990,18 @@ zpool_do_import(int argc, char **argv) nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; - boolean_t do_verbatim = B_FALSE; + int flags = ZFS_IMPORT_NORMAL; uint32_t rewind_policy = ZPOOL_NO_REWIND; boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; - uint64_t pool_state; + uint64_t pool_state, txg = -1ULL; char *cachefile = NULL; importargs_t idata = { 0 }; + char *endptr; /* check options */ - while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) { + while ((c = getopt(argc, argv, ":aCc:d:DEfFmnNo:R:T:VX")) != -1) { switch (c) { case 'a': do_all = B_TRUE; @@ -1660,14 +2026,20 @@ zpool_do_import(int argc, char **argv) do_destroyed = B_TRUE; break; case 'f': - do_force = B_TRUE; + flags |= ZFS_IMPORT_ANY_HOST; break; case 'F': do_rewind = B_TRUE; break; + case 'm': + flags |= ZFS_IMPORT_MISSING_LOG; + break; case 'n': dryrun = B_TRUE; break; + case 'N': + flags |= ZFS_IMPORT_ONLY; + break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { *propval = '\0'; @@ -1691,8 +2063,18 @@ zpool_do_import(int argc, char **argv) ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; + case 'T': + errno = 0; + txg = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("invalid txg value\n")); + usage(B_FALSE); + } + rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; + break; case 'V': - do_verbatim = B_TRUE; + flags |= ZFS_IMPORT_VERBATIM; break; case 'X': xtreme_rewind = B_TRUE; @@ -1731,12 +2113,13 @@ zpool_do_import(int argc, char **argv) /* In the future, we can capture further policy and include it here */ if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, txg) != 0 || nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) goto error; if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); - searchdirs[0] = "/dev/dsk"; + searchdirs[0] = "/dev"; nsearch = 1; } @@ -1784,8 +2167,10 @@ zpool_do_import(int argc, char **argv) errno = 0; searchguid = strtoull(argv[0], &endptr, 10); - if (errno != 0 || *endptr != '\0') + if (errno != 0 || *endptr != '\0') { searchname = argv[0]; + searchguid = 0; + } found_config = NULL; /* @@ -1864,7 +2249,7 @@ zpool_do_import(int argc, char **argv) if (do_all) { err |= do_import(config, NULL, mntopts, - do_force, props, do_verbatim); + props, flags); } else { show_import(config); } @@ -1913,7 +2298,7 @@ zpool_do_import(int argc, char **argv) err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? 
NULL : - argv[1], mntopts, do_force, props, do_verbatim); + argv[1], mntopts, props, flags); } } @@ -1935,10 +2320,10 @@ error: } typedef struct iostat_cbdata { - zpool_list_t *cb_list; - int cb_verbose; - int cb_iteration; + boolean_t cb_verbose; int cb_namewidth; + int cb_iteration; + zpool_list_t *cb_list; } iostat_cbdata_t; static void @@ -1991,13 +2376,13 @@ print_vdev_stats(zpool_handle_t *zhp, co char *vname; if (oldnv != NULL) { - verify(nvlist_lookup_uint64_array(oldnv, ZPOOL_CONFIG_STATS, - (uint64_t **)&oldvs, &c) == 0); + verify(nvlist_lookup_uint64_array(oldnv, + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); } else { oldvs = &zerovs; } - verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&newvs, &c) == 0); if (strlen(name) + depth > cb->cb_namewidth) @@ -2047,6 +2432,17 @@ print_vdev_stats(zpool_handle_t *zhp, co return; for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE, islog = B_FALSE; + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, + &islog); + + if (ishole || islog) + continue; + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); @@ -2054,6 +2450,31 @@ print_vdev_stats(zpool_handle_t *zhp, co } /* + * Log device section + */ + + if (num_logs(newnv) > 0) { + (void) printf("%-*s - - - - - " + "-\n", cb->cb_namewidth, "logs"); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE; + (void) nvlist_lookup_uint64(newchild[c], + ZPOOL_CONFIG_IS_LOG, &islog); + + if (islog) { + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + B_FALSE); + print_vdev_stats(zhp, vname, oldnv ? + oldchild[c] : NULL, newchild[c], + cb, depth + 2); + free(vname); + } + } + + } + + /* * Include level 2 ARC devices in iostat output */ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, @@ -2142,7 +2563,8 @@ get_namewidth(zpool_handle_t *zhp, void if (!cb->cb_verbose) cb->cb_namewidth = strlen(zpool_get_name(zhp)); else - cb->cb_namewidth = max_width(zhp, nvroot, 0, 0); + cb->cb_namewidth = max_width(zhp, nvroot, 0, + cb->cb_namewidth); } /* @@ -2158,55 +2580,14 @@ get_namewidth(zpool_handle_t *zhp, void } /* - * zpool iostat [-T d|u] [-v] [pool] ... [interval [count]] - * - * -T Display a timestamp in date(1) or Unix format - * -v Display statistics for individual vdevs - * - * This command can be tricky because we want to be able to deal with pool - * creation/destruction as well as vdev configuration changes. The bulk of this - * processing is handled by the pool_list_* routines in zpool_iter.c. We rely - * on pool_list_update() to detect the addition of new pools. Configuration - * changes are all handled within libzfs. + * Parse the input string, get the 'interval' and 'count' value if there is one. 
*/ -int -zpool_do_iostat(int argc, char **argv) +static void +get_interval_count(int *argcp, char **argv, unsigned long *iv, + unsigned long *cnt) { - int c; - int ret; - int npools; unsigned long interval = 0, count = 0; - zpool_list_t *list; - boolean_t verbose = B_FALSE; - iostat_cbdata_t cb; - - /* check options */ - while ((c = getopt(argc, argv, "T:v")) != -1) { - switch (c) { - case 'T': - if (optarg) { - if (*optarg == 'u') - timestamp_fmt = UDATE; - else if (*optarg == 'd') - timestamp_fmt = DDATE; - else - usage(B_FALSE); - } else { - usage(B_FALSE); - } - break; - case 'v': - verbose = B_TRUE; - break; - case '?': - (void) fprintf(stderr, gettext("invalid option '%c'\n"), - optopt); - usage(B_FALSE); - } - } - - argc -= optind; - argv += optind; + int argc = *argcp, errno; /* * Determine if the last argument is an integer or a pool name @@ -2223,7 +2604,6 @@ zpool_do_iostat(int argc, char **argv) "cannot be zero\n")); usage(B_FALSE); } - /* * Ignore the last parameter */ @@ -2240,7 +2620,7 @@ zpool_do_iostat(int argc, char **argv) /* * If the last argument is also an integer, then we have both a count - * and an integer. + * and an interval. */ if (argc > 0 && isdigit(argv[argc - 1][0])) { char *end; @@ -2265,23 +2645,83 @@ zpool_do_iostat(int argc, char **argv) } } - /* - * Construct the list of all interesting pools. - */ - ret = 0; - if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) - return (1); - - if (pool_list_count(list) == 0 && argc != 0) { - pool_list_free(list); - return (1); - } + *iv = interval; + *cnt = count; + *argcp = argc; +} - if (pool_list_count(list) == 0 && interval == 0) { - pool_list_free(list); - (void) fprintf(stderr, gettext("no pools available\n")); - return (1); - } +static void +get_timestamp_arg(char c) +{ + if (c == 'u') + timestamp_fmt = UDATE; + else if (c == 'd') + timestamp_fmt = DDATE; + else + usage(B_FALSE); +} + +/* + * zpool iostat [-v] [-T d|u] [pool] ... [interval [count]] + * + * -v Display statistics for individual vdevs + * -T Display a timestamp in date(1) or Unix format + * + * This command can be tricky because we want to be able to deal with pool + * creation/destruction as well as vdev configuration changes. The bulk of this + * processing is handled by the pool_list_* routines in zpool_iter.c. We rely + * on pool_list_update() to detect the addition of new pools. Configuration + * changes are all handled within libzfs. + */ +int +zpool_do_iostat(int argc, char **argv) +{ + int c; + int ret; + int npools; + unsigned long interval = 0, count = 0; + zpool_list_t *list; + boolean_t verbose = B_FALSE; + iostat_cbdata_t cb; + + /* check options */ + while ((c = getopt(argc, argv, "T:v")) != -1) { + switch (c) { + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + verbose = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + /* + * Construct the list of all interesting pools. + */ + ret = 0; + if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) + return (1); + + if (pool_list_count(list) == 0 && argc != 0) { + pool_list_free(list); + return (1); + } + + if (pool_list_count(list) == 0 && interval == 0) { + pool_list_free(list); + (void) fprintf(stderr, gettext("no pools available\n")); + return (1); + } /* * Enter the main iostat loop. 
@@ -2353,39 +2793,61 @@ zpool_do_iostat(int argc, char **argv) } typedef struct list_cbdata { + boolean_t cb_verbose; + int cb_namewidth; boolean_t cb_scripted; - boolean_t cb_first; zprop_list_t *cb_proplist; + boolean_t cb_literal; } list_cbdata_t; /* * Given a list of columns to display, output appropriate headers for each one. */ static void -print_header(zprop_list_t *pl) +print_header(list_cbdata_t *cb) { + zprop_list_t *pl = cb->cb_proplist; + char headerbuf[ZPOOL_MAXPROPLEN]; const char *header; boolean_t first = B_TRUE; boolean_t right_justify; + size_t width = 0; for (; pl != NULL; pl = pl->pl_next) { - if (pl->pl_prop == ZPROP_INVAL) - continue; + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. + */ + width = cb->cb_namewidth; + } if (!first) (void) printf(" "); else first = B_FALSE; - header = zpool_prop_column_name(pl->pl_prop); - right_justify = zpool_prop_align_right(pl->pl_prop); + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + header = zpool_prop_column_name(pl->pl_prop); + right_justify = zpool_prop_align_right(pl->pl_prop); + } else { + int i; + + for (i = 0; pl->pl_user_prop[i] != '\0'; i++) + headerbuf[i] = toupper(pl->pl_user_prop[i]); + headerbuf[i] = '\0'; + header = headerbuf; + } if (pl->pl_next == NULL && !right_justify) (void) printf("%s", header); else if (right_justify) - (void) printf("%*s", pl->pl_width, header); + (void) printf("%*s", width, header); else - (void) printf("%-*s", pl->pl_width, header); + (void) printf("%-*s", width, header); + } (void) printf("\n"); @@ -2396,17 +2858,28 @@ print_header(zprop_list_t *pl) * to the described layout. */ static void -print_pool(zpool_handle_t *zhp, zprop_list_t *pl, int scripted) +print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) { + zprop_list_t *pl = cb->cb_proplist; boolean_t first = B_TRUE; char property[ZPOOL_MAXPROPLEN]; char *propstr; boolean_t right_justify; - int width; + size_t width; for (; pl != NULL; pl = pl->pl_next) { + + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. + */ + width = cb->cb_namewidth; + } + if (!first) { - if (scripted) + if (cb->cb_scripted) (void) printf("\t"); else (void) printf(" "); @@ -2417,24 +2890,28 @@ print_pool(zpool_handle_t *zhp, zprop_li right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zpool_get_prop(zhp, pl->pl_prop, property, - sizeof (property), NULL) != 0) + sizeof (property), NULL, cb->cb_literal) != 0) propstr = "-"; else propstr = property; right_justify = zpool_prop_align_right(pl->pl_prop); + } else if ((zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop)) && + zpool_prop_get_feature(zhp, pl->pl_user_prop, property, + sizeof (property)) == 0) { + propstr = property; } else { propstr = "-"; } - width = pl->pl_width; /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width * format specifier. 
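[Editorial aside: the reworked print_header()/print_pool() lean on printf's dynamic field width, "%*s" right-justifies into a runtime width, "%-*s" left-justifies, and the final left-justified column drops the width entirely so no trailing padding is emitted (note that the * width is consumed as an int). A stand-alone illustration of that layout trick; the column names and widths below are made up.]

#include <stdio.h>

int
main(void)
{
	const char *names[] = { "NAME", "SIZE", "HEALTH" };
	int widths[] = { 10, 6, 0 };	/* 0 marks the last column */
	int right[] = { 0, 1, 0 };	/* right-justify SIZE only */
	int i;

	for (i = 0; i < 3; i++) {
		if (i > 0)
			(void) printf("  ");
		if (widths[i] == 0)	/* last column, no padding */
			(void) printf("%s", names[i]);
		else if (right[i])
			(void) printf("%*s", widths[i], names[i]);
		else
			(void) printf("%-*s", widths[i], names[i]);
	}
	(void) printf("\n");
	return (0);
}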
*/ - if (scripted || (pl->pl_next == NULL && !right_justify)) + if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) (void) printf("%s", propstr); else if (right_justify) (void) printf("%*s", width, propstr); @@ -2445,6 +2922,155 @@ print_pool(zpool_handle_t *zhp, zprop_li (void) printf("\n"); } +static void +print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted, + boolean_t valid) +{ + char propval[64]; + boolean_t fixed; + size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); + + switch (prop) { + case ZPOOL_PROP_EXPANDSZ: + if (value == 0) + (void) strlcpy(propval, "-", sizeof (propval)); + else + zfs_nicenum(value, propval, sizeof (propval)); + break; + case ZPOOL_PROP_FRAGMENTATION: + if (value == ZFS_FRAG_INVALID) { + (void) strlcpy(propval, "-", sizeof (propval)); + } else { + (void) snprintf(propval, sizeof (propval), "%llu%%", + value); + } + break; + case ZPOOL_PROP_CAPACITY: + (void) snprintf(propval, sizeof (propval), "%llu%%", value); + break; + default: + zfs_nicenum(value, propval, sizeof (propval)); + } + + if (!valid) + (void) strlcpy(propval, "-", sizeof (propval)); + + if (scripted) + (void) printf("\t%s", propval); + else + (void) printf(" %*s", width, propval); +} + +void +print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + list_cbdata_t *cb, int depth) +{ + nvlist_t **child; + vdev_stat_t *vs; + uint_t c, children; + char *vname; + boolean_t scripted = cb->cb_scripted; + uint64_t islog = B_FALSE; + boolean_t haslog = B_FALSE; + char *dashes = "%-*s - - - - - -\n"; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + if (name != NULL) { + boolean_t toplevel = (vs->vs_space != 0); + uint64_t cap; + + if (scripted) + (void) printf("\t%s", name); + else if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - depth), ""); + + /* + * Print the properties for the individual vdevs. Some + * properties are only applicable to toplevel vdevs. The + * 'toplevel' boolean value is passed to the print_one_column() + * to indicate that the value is valid. + */ + print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted, + toplevel); + print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, scripted, + toplevel); + print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, + scripted, toplevel); + print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted, + B_TRUE); + print_one_column(ZPOOL_PROP_FRAGMENTATION, + vs->vs_fragmentation, scripted, + (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel)); + cap = (vs->vs_space == 0) ? 
0 : + (vs->vs_alloc * 100 / vs->vs_space); + print_one_column(ZPOOL_PROP_CAPACITY, cap, scripted, toplevel); + (void) printf("\n"); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE; + + if (nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) + continue; + + if (nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) { + haslog = B_TRUE; + continue; + } + + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + print_list_stats(zhp, vname, child[c], cb, depth + 2); + free(vname); + } + + if (haslog == B_TRUE) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, "log"); + for (c = 0; c < children; c++) { + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog) != 0 || !islog) + continue; + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + print_list_stats(zhp, vname, child[c], cb, depth + 2); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0 && children > 0) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, "cache"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + print_list_stats(zhp, vname, child[c], cb, depth + 2); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, + &children) == 0 && children > 0) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, "spare"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); + print_list_stats(zhp, vname, child[c], cb, depth + 2); + free(vname); + } + } +} + + /* * Generic callback function to list a pool. */ @@ -2452,25 +3078,32 @@ int list_callback(zpool_handle_t *zhp, void *data) { list_cbdata_t *cbp = data; + nvlist_t *config; + nvlist_t *nvroot; - if (cbp->cb_first) { - if (!cbp->cb_scripted) - print_header(cbp->cb_proplist); - cbp->cb_first = B_FALSE; - } + config = zpool_get_config(zhp, NULL); + + print_pool(zhp, cbp); + if (!cbp->cb_verbose) + return (0); - print_pool(zhp, cbp->cb_proplist, cbp->cb_scripted); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + print_list_stats(zhp, NULL, nvroot, cbp, 0); return (0); } /* - * zpool list [-H] [-o prop[,prop]*] [pool] ... + * zpool list [-Hp] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] * * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of properties to display. Defaults to - * "name,size,allocated,free,capacity,health,altroot" + * "name,size,allocated,free,expandsize,fragmentation,capacity," + * "dedupratio,health,altroot" + * -p Diplay values in parsable (exact) format. + * -T Display a timestamp in date(1) or Unix format * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. 
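[Editorial aside: print_list_stats() above walks the vdev tree recursively, indenting each level by two spaces (depth + 2) and padding the name out to the shared name-column width so the per-vdev numbers line up. A reduced sketch of that indentation scheme over a toy tree; the struct node type and sample data are invented for illustration only.]

#include <stdio.h>
#include <string.h>

struct node {
	const char	*name;
	struct node	*child[3];
};

static void
print_tree(const struct node *n, int depth, int namewidth)
{
	int i;

	/* indent, then pad the name to the shared column width */
	(void) printf("%*s%s%*s  <stats>\n", depth, "", n->name,
	    (int)(namewidth - strlen(n->name) - depth), "");

	for (i = 0; i < 3 && n->child[i] != NULL; i++)
		print_tree(n->child[i], depth + 2, namewidth);
}

int
main(void)
{
	struct node d1 = { "disk0", { NULL } };
	struct node d2 = { "disk1", { NULL } };
	struct node m = { "mirror-0", { &d1, &d2, NULL } };
	struct node root = { "tank", { &m, NULL, NULL } };

	print_tree(&root, 0, 20);
	return (0);
}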
@@ -2482,11 +3115,15 @@ zpool_do_list(int argc, char **argv) int ret; list_cbdata_t cb = { 0 }; static char default_props[] = - "name,size,allocated,free,capacity,dedupratio,health,altroot"; + "name,size,allocated,free,expandsize,fragmentation,capacity," + "dedupratio,health,altroot"; char *props = default_props; + unsigned long interval = 0, count = 0; + zpool_list_t *list; + boolean_t first = B_TRUE; /* check options */ - while ((c = getopt(argc, argv, ":Ho:")) != -1) { + while ((c = getopt(argc, argv, ":Ho:pT:v")) != -1) { switch (c) { case 'H': cb.cb_scripted = B_TRUE; @@ -2494,6 +3131,15 @@ zpool_do_list(int argc, char **argv) case 'o': props = optarg; break; + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + cb.cb_verbose = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -2509,49 +3155,49 @@ zpool_do_list(int argc, char **argv) argc -= optind; argv += optind; + get_interval_count(&argc, argv, &interval, &count); + if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); - cb.cb_first = B_TRUE; + for (;;) { + if ((list = pool_list_get(argc, argv, &cb.cb_proplist, + &ret)) == NULL) + return (1); - ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, - list_callback, &cb); + if (pool_list_count(list) == 0) + break; - zprop_free_list(cb.cb_proplist); + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); - if (argc == 0 && cb.cb_first && !cb.cb_scripted) { - (void) printf(gettext("no pools available\n")); - return (0); - } + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); - return (ret); -} + if (!cb.cb_scripted && (first || cb.cb_verbose)) { + print_header(&cb); + first = B_FALSE; + } + ret = pool_list_iter(list, B_TRUE, list_callback, &cb); -static nvlist_t * -zpool_get_vdev_by_name(nvlist_t *nv, char *name) -{ - nvlist_t **child; - uint_t c, children; - nvlist_t *match; - char *path; + if (interval == 0) + break; - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) { - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - if (strncmp(name, "/dev/dsk/", 9) == 0) - name += 9; - if (strncmp(path, "/dev/dsk/", 9) == 0) - path += 9; - if (strcmp(name, path) == 0) - return (nv); - return (NULL); - } - - for (c = 0; c < children; c++) - if ((match = zpool_get_vdev_by_name(child[c], name)) != NULL) - return (match); + if (count != 0 && --count == 0) + break; + + pool_list_free(list); + (void) sleep(interval); + } + + if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { + (void) printf(gettext("no pools available\n")); + ret = 0; + } - return (NULL); + pool_list_free(list); + zprop_free_list(cb.cb_proplist); + return (ret); } static int @@ -2769,8 +3415,7 @@ zpool_do_split(int argc, char **argv) if (add_prop_list( zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE) != 0) { - if (props) - nvlist_free(props); + nvlist_free(props); usage(B_FALSE); } break; @@ -2783,8 +3428,7 @@ zpool_do_split(int argc, char **argv) propval++; if (add_prop_list(optarg, propval, &props, B_TRUE) != 0) { - if (props) - nvlist_free(props); + nvlist_free(props); usage(B_FALSE); } } else { @@ -2857,7 +3501,7 @@ zpool_do_split(int argc, char **argv) if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && zpool_enable_datasets(zhp, mntopts, 0) != 0) { ret = 1; - (void) fprintf(stderr, gettext("Split was succssful, but " + (void) fprintf(stderr, gettext("Split 
was successful, but " "the datasets could not all be mounted\n")); (void) fprintf(stderr, gettext("Try doing '%s' with a " "different altroot\n"), "zpool import"); @@ -3086,51 +3730,20 @@ zpool_do_clear(int argc, char **argv) return (ret); } -typedef struct scrub_cbdata { - int cb_type; - int cb_argc; - char **cb_argv; -} scrub_cbdata_t; - -int -scrub_callback(zpool_handle_t *zhp, void *data) -{ - scrub_cbdata_t *cb = data; - int err; - - /* - * Ignore faulted pools. - */ - if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - (void) fprintf(stderr, gettext("cannot scrub '%s': pool is " - "currently unavailable\n"), zpool_get_name(zhp)); - return (1); - } - - err = zpool_scrub(zhp, cb->cb_type); - - return (err != 0); -} - /* - * zpool scrub [-s] ... - * - * -s Stop. Stops any in-progress scrub. + * zpool reguid */ int -zpool_do_scrub(int argc, char **argv) +zpool_do_reguid(int argc, char **argv) { int c; - scrub_cbdata_t cb; - - cb.cb_type = POOL_SCRUB_EVERYTHING; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; /* check options */ - while ((c = getopt(argc, argv, "s")) != -1) { + while ((c = getopt(argc, argv, "")) != -1) { switch (c) { - case 's': - cb.cb_type = POOL_SCRUB_NONE; - break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3138,24 +3751,146 @@ zpool_do_scrub(int argc, char **argv) } } - cb.cb_argc = argc; - cb.cb_argv = argv; argc -= optind; argv += optind; + /* get pool name and check number of arguments */ if (argc < 1) { - (void) fprintf(stderr, gettext("missing pool name argument\n")); + (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); -} + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } -typedef struct status_cbdata { - int cb_count; - boolean_t cb_allpools; - boolean_t cb_verbose; - boolean_t cb_explain; + poolname = argv[0]; + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + ret = zpool_reguid(zhp); + + zpool_close(zhp); + return (ret); +} + + +/* + * zpool reopen + * + * Reopen the pool so that the kernel can update the sizes of all vdevs. + */ +int +zpool_do_reopen(int argc, char **argv) +{ + int c; + int ret = 0; + zpool_handle_t *zhp; + char *pool; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc--; + argv++; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + pool = argv[0]; + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) + return (1); + + ret = zpool_reopen(zhp); + zpool_close(zhp); + return (ret); +} + +typedef struct scrub_cbdata { + int cb_type; + int cb_argc; + char **cb_argv; +} scrub_cbdata_t; + +int +scrub_callback(zpool_handle_t *zhp, void *data) +{ + scrub_cbdata_t *cb = data; + int err; + + /* + * Ignore faulted pools. + */ + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + (void) fprintf(stderr, gettext("cannot scrub '%s': pool is " + "currently unavailable\n"), zpool_get_name(zhp)); + return (1); + } + + err = zpool_scan(zhp, cb->cb_type); + + return (err != 0); +} + +/* + * zpool scrub [-s] ... + * + * -s Stop. Stops any in-progress scrub. 
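[Editorial aside: the new zpool list main loop above, like the reworked iostat and status paths, follows one polling skeleton: do a pass, stop immediately when no interval was given, otherwise decrement the optional count and sleep between passes. A reduced sketch of that loop, with do_one_pass() standing in for the real per-iteration work.]

#include <stdio.h>
#include <unistd.h>

static void
do_one_pass(void)
{
	(void) printf("pass\n");	/* stand-in for the real report */
}

static void
poll_loop(unsigned long interval, unsigned long count)
{
	for (;;) {
		do_one_pass();

		if (interval == 0)		/* single-shot mode */
			break;
		if (count != 0 && --count == 0)	/* finite repeat */
			break;
		(void) sleep(interval);
	}
}

int
main(void)
{
	poll_loop(1, 3);	/* three passes, one second apart */
	return (0);
}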
+ */ +int +zpool_do_scrub(int argc, char **argv) +{ + int c; + scrub_cbdata_t cb; + + cb.cb_type = POOL_SCAN_SCRUB; + + /* check options */ + while ((c = getopt(argc, argv, "s")) != -1) { + switch (c) { + case 's': + cb.cb_type = POOL_SCAN_NONE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + cb.cb_argc = argc; + cb.cb_argv = argv; + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); +} + +typedef struct status_cbdata { + int cb_count; + boolean_t cb_allpools; + boolean_t cb_verbose; + boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; } status_cbdata_t; @@ -3164,62 +3899,112 @@ typedef struct status_cbdata { * Print out detailed scrub status. */ void -print_scrub_status(nvlist_t *nvroot) +print_scan_status(pool_scan_stat_t *ps) { - vdev_stat_t *vs; - uint_t vsc; - time_t start, end, now; + time_t start, end; + uint64_t elapsed, mins_left, hours_left; + uint64_t pass_exam, examined, total; + uint_t rate; double fraction_done; - uint64_t examined, total, minutes_left, minutes_taken; - char *scrub_type; + char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &vsc) == 0); + (void) printf(gettext(" scan: ")); - /* - * If there's never been a scrub, there's not much to say. - */ - if (vs->vs_scrub_end == 0 && vs->vs_scrub_type == POOL_SCRUB_NONE) { + /* If there's never been a scan, there's not much to say. */ + if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || + ps->pss_func >= POOL_SCAN_FUNCS) { (void) printf(gettext("none requested\n")); return; } - scrub_type = (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ? - "resilver" : "scrub"; + start = ps->pss_start_time; + end = ps->pss_end_time; + zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); - start = vs->vs_scrub_start; - end = vs->vs_scrub_end; - now = time(NULL); - examined = vs->vs_scrub_examined; - total = vs->vs_alloc; - - if (end != 0) { - minutes_taken = (uint64_t)((end - start) / 60); - - (void) printf(gettext("%s %s after %lluh%um with %llu errors " - "on %s"), - scrub_type, vs->vs_scrub_complete ? "completed" : "stopped", + assert(ps->pss_func == POOL_SCAN_SCRUB || + ps->pss_func == POOL_SCAN_RESILVER); + /* + * Scan is finished or canceled. + */ + if (ps->pss_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + char *fmt = NULL; + + if (ps->pss_func == POOL_SCAN_SCRUB) { + fmt = gettext("scrub repaired %s in %lluh%um with " + "%llu errors on %s"); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + fmt = gettext("resilvered %s in %lluh%um with " + "%llu errors on %s"); + } + /* LINTED */ + (void) printf(fmt, processed_buf, (u_longlong_t)(minutes_taken / 60), (uint_t)(minutes_taken % 60), - (u_longlong_t)vs->vs_scrub_errors, ctime(&end)); + (u_longlong_t)ps->pss_errors, + ctime((time_t *)&end)); + return; + } else if (ps->pss_state == DSS_CANCELED) { + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub canceled on %s"), + ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver canceled on %s"), + ctime(&end)); + } return; } - if (examined == 0) - examined = 1; - if (examined > total) - total = examined; + assert(ps->pss_state == DSS_SCANNING); + + /* + * Scan is in progress. 
+ */ + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub in progress since %s"), + ctime(&start)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver in progress since %s"), + ctime(&start)); + } + examined = ps->pss_examined ? ps->pss_examined : 1; + total = ps->pss_to_examine; fraction_done = (double)examined / total; - minutes_left = (uint64_t)((now - start) * - (1 - fraction_done) / fraction_done / 60); - minutes_taken = (uint64_t)((now - start) / 60); - - (void) printf(gettext("%s in progress for %lluh%um, %.2f%% done, " - "%lluh%um to go\n"), - scrub_type, (u_longlong_t)(minutes_taken / 60), - (uint_t)(minutes_taken % 60), 100 * fraction_done, - (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60)); + + /* elapsed time for this pass */ + elapsed = time(NULL) - ps->pss_pass_start; + elapsed = elapsed ? elapsed : 1; + pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; + rate = pass_exam / elapsed; + rate = rate ? rate : 1; + mins_left = ((total - examined) / rate) / 60; + hours_left = mins_left / 60; + + zfs_nicenum(examined, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than 30 days + */ + (void) printf(gettext(" %s scanned out of %s at %s/s"), + examined_buf, total_buf, rate_buf); + if (hours_left < (30 * 24)) { + (void) printf(gettext(", %lluh%um to go\n"), + (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); + } else { + (void) printf(gettext( + ", (scan is slow, no estimated time)\n")); + } + + if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext(" %s resilvered, %.2f%% done\n"), + processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext(" %s repaired, %.2f%% done\n"), + processed_buf, 100 * fraction_done); + } } static void @@ -3307,14 +4092,20 @@ print_dedup_stats(nvlist_t *config) /* * If the pool was faulted then we may not have been able to - * obtain the config. Otherwise, if have anything in the dedup + * obtain the config. Otherwise, if we have anything in the dedup * table continue processing the stats. */ if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, - (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0) + (uint64_t **)&ddo, &c) != 0) return; (void) printf("\n"); + (void) printf(gettext(" dedup: ")); + if (ddo->ddo_count == 0) { + (void) printf(gettext("no DDT entries\n")); + return; + } + (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", (u_longlong_t)ddo->ddo_count, (u_longlong_t)ddo->ddo_dspace, @@ -3333,7 +4124,7 @@ print_dedup_stats(nvlist_t *config) * pool: tank * status: DEGRADED * reason: One or more devices ... - * see: http://www.sun.com/msg/ZFS-xxxx-01 + * see: http://illumos.org/msg/ZFS-xxxx-01 * config: * mirror DEGRADED * c1t0d0 OK @@ -3362,7 +4153,11 @@ status_callback(zpool_handle_t *zhp, voi * If we were given 'zpool status -x', only report those pools with * problems. 
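[Editorial aside: the in-progress branch of print_scan_status() above estimates time remaining from this pass's throughput: rate is bytes examined this pass divided by elapsed seconds (each clamped to at least 1 to avoid dividing by zero), minutes left is (total - examined) / rate / 60, and the estimate is suppressed once it exceeds 30 days. A small worked sketch of that arithmetic with made-up numbers.]

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t total = 4ULL << 40;		/* 4 TiB to scan (example) */
	uint64_t examined = 1ULL << 40;		/* 1 TiB already done */
	uint64_t pass_exam = 512ULL << 30;	/* 512 GiB this pass */
	uint64_t elapsed = 3600;		/* seconds in this pass */
	uint64_t rate, mins_left, hours_left;
	double fraction_done;

	elapsed = elapsed ? elapsed : 1;	/* avoid divide-by-zero */
	pass_exam = pass_exam ? pass_exam : 1;
	rate = pass_exam / elapsed;
	rate = rate ? rate : 1;

	mins_left = ((total - examined) / rate) / 60;
	hours_left = mins_left / 60;
	fraction_done = (double)examined / total;

	(void) printf("%.2f%% done, %" PRIu64 "h%" PRIu64 "m to go\n",
	    100 * fraction_done, hours_left, mins_left % 60);
	if (hours_left >= 30 * 24)
		(void) printf("(scan is slow, no estimated time)\n");
	return (0);
}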
*/ - if (reason == ZPOOL_STATUS_OK && cbp->cb_explain) { + if (cbp->cb_explain && + (reason == ZPOOL_STATUS_OK || + reason == ZPOOL_STATUS_VERSION_OLDER || + reason == ZPOOL_STATUS_NON_NATIVE_ASHIFT || + reason == ZPOOL_STATUS_FEAT_DISABLED)) { if (!cbp->cb_allpools) { (void) printf(gettext("pool '%s' is healthy\n"), zpool_get_name(zhp)); @@ -3379,7 +4174,7 @@ status_callback(zpool_handle_t *zhp, voi verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) == 0); health = zpool_state_to_name(vs->vs_state, vs->vs_aux); @@ -3452,7 +4247,6 @@ status_callback(zpool_handle_t *zhp, voi "replace'.\n")); break; - case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being resilvered. The pool will\n\tcontinue " @@ -3478,12 +4272,13 @@ status_callback(zpool_handle_t *zhp, voi break; case ZPOOL_STATUS_VERSION_OLDER: - (void) printf(gettext("status: The pool is formatted using an " - "older on-disk format. The pool can\n\tstill be used, but " - "some features are unavailable.\n")); + (void) printf(gettext("status: The pool is formatted using a " + "legacy on-disk format. The pool can\n\tstill be used, " + "but some features are unavailable.\n")); (void) printf(gettext("action: Upgrade the pool using 'zpool " "upgrade'. Once this is done, the\n\tpool will no longer " - "be accessible on older software versions.\n")); + "be accessible on software that does not support feature\n" + "\tflags.\n")); break; case ZPOOL_STATUS_VERSION_NEWER: @@ -3495,6 +4290,41 @@ status_callback(zpool_handle_t *zhp, voi "backup.\n")); break; + case ZPOOL_STATUS_FEAT_DISABLED: + (void) printf(gettext("status: Some supported features are not " + "enabled on the pool. The pool can\n\tstill be used, but " + "some features are unavailable.\n")); + (void) printf(gettext("action: Enable all features using " + "'zpool upgrade'. Once this is done,\n\tthe pool may no " + "longer be accessible by software that does not support\n\t" + "the features. See zpool-features(7) for details.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_READ: + (void) printf(gettext("status: The pool cannot be accessed on " + "this system because it uses the\n\tfollowing feature(s) " + "not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + (void) printf(gettext("action: Access the pool from a system " + "that supports the required feature(s),\n\tor restore the " + "pool from backup.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + (void) printf(gettext("status: The pool can only be accessed " + "in read-only mode on this system. It\n\tcannot be " + "accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + (void) printf(gettext("action: The pool cannot be accessed in " + "read-write mode. 
Import the pool with\n" + "\t\"-o readonly=on\", access the pool from a system that " + "supports the\n\trequired feature(s), or restore the " + "pool from backup.\n")); + break; + case ZPOOL_STATUS_FAULTED_DEV_R: (void) printf(gettext("status: One or more devices are " "faulted in response to persistent errors.\n\tSufficient " @@ -3534,6 +4364,15 @@ status_callback(zpool_handle_t *zhp, voi "'zpool clear'.\n")); break; + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + (void) printf(gettext("status: One or more devices are " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + (void) printf(gettext("action: Replace affected devices with " + "devices that support the\n\tconfigured block size, or " + "migrate data to a properly configured\n\tpool.\n")); + break; + default: /* * The remaining errors can't actually be generated, yet. @@ -3542,7 +4381,7 @@ status_callback(zpool_handle_t *zhp, voi } if (msgid != NULL) - (void) printf(gettext(" see: http://www.sun.com/msg/%s\n"), + (void) printf(gettext(" see: http://illumos.org/msg/%s\n"), msgid); if (config != NULL) { @@ -3550,10 +4389,11 @@ status_callback(zpool_handle_t *zhp, voi uint64_t nerr; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + pool_scan_stat_t *ps = NULL; - - (void) printf(gettext(" scrub: ")); - print_scrub_status(nvroot); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); + print_scan_status(ps); namewidth = max_width(zhp, nvroot, 0, 0); if (namewidth < 10) @@ -3621,11 +4461,12 @@ status_callback(zpool_handle_t *zhp, voi } /* - * zpool status [-vx] [pool] ... + * zpool status [-vx] [-T d|u] [pool] ... [interval [count]] * * -v Display complete error logs * -x Display only pools with potential problems * -D Display dedup status (undocumented) + * -T Display a timestamp in date(1) or Unix format * * Describes the health status of all pools or some subset. 
*/ @@ -3634,10 +4475,11 @@ zpool_do_status(int argc, char **argv) { int c; int ret; + unsigned long interval = 0, count = 0; status_cbdata_t cb = { 0 }; /* check options */ - while ((c = getopt(argc, argv, "vxD")) != -1) { + while ((c = getopt(argc, argv, "vxDT:")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; @@ -3648,6 +4490,9 @@ zpool_do_status(int argc, char **argv) case 'D': cb.cb_dedup_stats = B_TRUE; break; + case 'T': + get_timestamp_arg(*optarg); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3658,72 +4503,280 @@ zpool_do_status(int argc, char **argv) argc -= optind; argv += optind; - cb.cb_first = B_TRUE; + get_interval_count(&argc, argv, &interval, &count); if (argc == 0) cb.cb_allpools = B_TRUE; - ret = for_each_pool(argc, argv, B_TRUE, NULL, status_callback, &cb); + cb.cb_first = B_TRUE; - if (argc == 0 && cb.cb_count == 0) - (void) printf(gettext("no pools available\n")); - else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) - (void) printf(gettext("all pools are healthy\n")); + for (;;) { + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); - return (ret); + ret = for_each_pool(argc, argv, B_TRUE, NULL, + status_callback, &cb); + + if (argc == 0 && cb.cb_count == 0) + (void) printf(gettext("no pools available\n")); + else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) + (void) printf(gettext("all pools are healthy\n")); + + if (ret != 0) + return (ret); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) sleep(interval); + } + + return (0); } typedef struct upgrade_cbdata { - int cb_all; - int cb_first; - int cb_newer; - int cb_argc; - uint64_t cb_version; - char **cb_argv; + boolean_t cb_first; + boolean_t cb_unavail; + char cb_poolname[ZFS_MAX_DATASET_NAME_LEN]; + int cb_argc; + uint64_t cb_version; + char **cb_argv; } upgrade_cbdata_t; +#ifdef __FreeBSD__ +static int +is_root_pool(zpool_handle_t *zhp) +{ + static struct statfs sfs; + static char *poolname = NULL; + static boolean_t stated = B_FALSE; + char *slash; + + if (!stated) { + stated = B_TRUE; + if (statfs("/", &sfs) == -1) { + (void) fprintf(stderr, + "Unable to stat root file system: %s.\n", + strerror(errno)); + return (0); + } + if (strcmp(sfs.f_fstypename, "zfs") != 0) + return (0); + poolname = sfs.f_mntfromname; + if ((slash = strchr(poolname, '/')) != NULL) + *slash = '\0'; + } + return (poolname != NULL && strcmp(poolname, zpool_get_name(zhp)) == 0); +} + +static void +root_pool_upgrade_check(zpool_handle_t *zhp, char *poolname, int size) +{ + + if (poolname[0] == '\0' && is_root_pool(zhp)) + (void) strlcpy(poolname, zpool_get_name(zhp), size); +} +#endif /* FreeBSD */ + +static int +upgrade_version(zpool_handle_t *zhp, uint64_t version) +{ + int ret; + nvlist_t *config; + uint64_t oldversion; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &oldversion) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(oldversion)); + assert(oldversion < version); + + ret = zpool_upgrade(zhp, version); + if (ret != 0) + return (ret); + + if (version >= SPA_VERSION_FEATURES) { + (void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to feature flags.\n"), + zpool_get_name(zhp), oldversion); + } else { + (void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to version %llu.\n"), + zpool_get_name(zhp), oldversion, version); + } + + return (0); +} + +static int +upgrade_enable_all(zpool_handle_t *zhp, int *countp) +{ + int i, ret, 
count; + boolean_t firstff = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + count = 0; + for (i = 0; i < SPA_FEATURES; i++) { + const char *fname = spa_feature_table[i].fi_uname; + const char *fguid = spa_feature_table[i].fi_guid; + if (!nvlist_exists(enabled, fguid)) { + char *propname; + verify(-1 != asprintf(&propname, "feature@%s", fname)); + ret = zpool_set_prop(zhp, propname, + ZFS_FEATURE_ENABLED); + if (ret != 0) { + free(propname); + return (ret); + } + count++; + + if (firstff) { + (void) printf(gettext("Enabled the " + "following features on '%s':\n"), + zpool_get_name(zhp)); + firstff = B_FALSE; + } + (void) printf(gettext(" %s\n"), fname); + free(propname); + } + } + + if (countp != NULL) + *countp = count; + return (0); +} + static int upgrade_cb(zpool_handle_t *zhp, void *arg) { upgrade_cbdata_t *cbp = arg; nvlist_t *config; uint64_t version; - int ret = 0; + boolean_t printnl = B_FALSE; + int ret; + + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + (void) fprintf(stderr, gettext("cannot upgrade '%s': pool is " + "currently unavailable.\n\n"), zpool_get_name(zhp)); + cbp->cb_unavail = B_TRUE; + /* Allow iteration to continue. */ + return (0); + } config = zpool_get_config(zhp, NULL); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); - if (!cbp->cb_newer && version < SPA_VERSION) { - if (!cbp->cb_all) { - if (cbp->cb_first) { - (void) printf(gettext("The following pools are " - "out of date, and can be upgraded. After " - "being\nupgraded, these pools will no " - "longer be accessible by older software " - "versions.\n\n")); - (void) printf(gettext("VER POOL\n")); - (void) printf(gettext("--- ------------\n")); - cbp->cb_first = B_FALSE; - } + assert(SPA_VERSION_IS_SUPPORTED(version)); - (void) printf("%2llu %s\n", (u_longlong_t)version, - zpool_get_name(zhp)); - } else { + if (version < cbp->cb_version) { + cbp->cb_first = B_FALSE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); +#ifdef __FreeBSD__ + root_pool_upgrade_check(zhp, cbp->cb_poolname, + sizeof(cbp->cb_poolname)); +#endif /* __FreeBSD__ */ + printnl = B_TRUE; + +#ifdef illumos + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). + */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; +#endif + } + + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count > 0) { + cbp->cb_first = B_FALSE; + printnl = B_TRUE; +#ifdef __FreeBSD__ + root_pool_upgrade_check(zhp, cbp->cb_poolname, + sizeof(cbp->cb_poolname)); +#endif /* __FreeBSD__ */ + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). 
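[Editorial aside: the is_root_pool() helper above (under #ifdef __FreeBSD__) decides whether the pool being upgraded backs the root file system by statfs()ing "/", comparing f_fstypename against "zfs", and truncating f_mntfromname at the first '/' to recover the pool name. Below is a hedged, stand-alone sketch of the same idea using FreeBSD's struct statfs field names; it is illustrative only and is not the patch code.]

#include <sys/param.h>
#include <sys/mount.h>

#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct statfs sfs;
	char *slash;

	if (statfs("/", &sfs) == -1) {
		perror("statfs");
		return (1);
	}
	if (strcmp(sfs.f_fstypename, "zfs") != 0) {
		(void) printf("root is not on ZFS (%s)\n", sfs.f_fstypename);
		return (0);
	}
	/* e.g. "tank/ROOT/default" -> "tank" */
	if ((slash = strchr(sfs.f_mntfromname, '/')) != NULL)
		*slash = '\0';
	(void) printf("root pool: %s\n", sfs.f_mntfromname);
	return (0);
}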
+ */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + } + + if (printnl) { + (void) printf(gettext("\n")); + } + + return (0); +} + +static int +upgrade_list_unavail(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + if (cbp->cb_first) { + (void) fprintf(stderr, gettext("The following pools " + "are unavailable and cannot be upgraded as this " + "time.\n\n")); + (void) fprintf(stderr, gettext("POOL\n")); + (void) fprintf(stderr, gettext("------------\n")); cbp->cb_first = B_FALSE; - ret = zpool_upgrade(zhp, cbp->cb_version); - if (!ret) { - (void) printf(gettext("Successfully upgraded " - "'%s'\n\n"), zpool_get_name(zhp)); - } } - } else if (cbp->cb_newer && version > SPA_VERSION) { - assert(!cbp->cb_all); + (void) printf(gettext("%s\n"), zpool_get_name(zhp)); + cbp->cb_unavail = B_TRUE; + } + return (0); +} + +static int +upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + /* + * This will have been reported by upgrade_list_unavail so + * just allow iteration to continue. + */ + cbp->cb_unavail = B_TRUE; + return (0); + } + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(version)); + + if (version < SPA_VERSION_FEATURES) { if (cbp->cb_first) { (void) printf(gettext("The following pools are " - "formatted using a newer software version and\n" - "cannot be accessed on the current system.\n\n")); + "formatted with legacy version numbers and can\n" + "be upgraded to use feature flags. After " + "being upgraded, these pools\nwill no " + "longer be accessible by software that does not " + "support feature\nflags.\n\n")); (void) printf(gettext("VER POOL\n")); (void) printf(gettext("--- ------------\n")); cbp->cb_first = B_FALSE; @@ -3733,48 +4786,142 @@ upgrade_cb(zpool_handle_t *zhp, void *ar zpool_get_name(zhp)); } - zpool_close(zhp); - return (ret); + return (0); +} + +static int +upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + /* + * This will have been reported by upgrade_list_unavail so + * just allow iteration to continue. + */ + cbp->cb_unavail = B_TRUE; + return (0); + } + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + if (version >= SPA_VERSION_FEATURES) { + int i; + boolean_t poolfirst = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + for (i = 0; i < SPA_FEATURES; i++) { + const char *fguid = spa_feature_table[i].fi_guid; + const char *fname = spa_feature_table[i].fi_uname; + if (!nvlist_exists(enabled, fguid)) { + if (cbp->cb_first) { + (void) printf(gettext("\nSome " + "supported features are not " + "enabled on the following pools. " + "Once a\nfeature is enabled the " + "pool may become incompatible with " + "software\nthat does not support " + "the feature. 
See " + "zpool-features(7) for " + "details.\n\n")); + (void) printf(gettext("POOL " + "FEATURE\n")); + (void) printf(gettext("------" + "---------\n")); + cbp->cb_first = B_FALSE; + } + + if (poolfirst) { + (void) printf(gettext("%s\n"), + zpool_get_name(zhp)); + poolfirst = B_FALSE; + } + + (void) printf(gettext(" %s\n"), fname); + } + } + } + + return (0); } /* ARGSUSED */ static int upgrade_one(zpool_handle_t *zhp, void *data) { + boolean_t printnl = B_FALSE; upgrade_cbdata_t *cbp = data; uint64_t cur_version; int ret; + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + (void) fprintf(stderr, gettext("cannot upgrade '%s': pool is " + "is currently unavailable.\n\n"), zpool_get_name(zhp)); + cbp->cb_unavail = B_TRUE; + return (1); + } + if (strcmp("log", zpool_get_name(zhp)) == 0) { (void) printf(gettext("'log' is now a reserved word\n" "Pool 'log' must be renamed using export and import" - " to upgrade.\n")); + " to upgrade.\n\n")); return (1); } cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if (cur_version > cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " - "using more current version '%llu'.\n"), + "using more current version '%llu'.\n\n"), zpool_get_name(zhp), cur_version); return (0); } - if (cur_version == cbp->cb_version) { + + if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { (void) printf(gettext("Pool '%s' is already formatted " - "using the current version.\n"), zpool_get_name(zhp)); + "using version %llu.\n\n"), zpool_get_name(zhp), + cbp->cb_version); return (0); } - ret = zpool_upgrade(zhp, cbp->cb_version); + if (cur_version != cbp->cb_version) { + printnl = B_TRUE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); +#ifdef __FreeBSD__ + root_pool_upgrade_check(zhp, cbp->cb_poolname, + sizeof(cbp->cb_poolname)); +#endif /* __FreeBSD__ */ + } + + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count = 0; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count != 0) { + printnl = B_TRUE; +#ifdef __FreeBSD__ + root_pool_upgrade_check(zhp, cbp->cb_poolname, + sizeof(cbp->cb_poolname)); +#endif /* __FreeBSD __*/ + } else if (cur_version == SPA_VERSION) { + (void) printf(gettext("Pool '%s' already has all " + "supported features enabled.\n\n"), + zpool_get_name(zhp)); + } + } - if (!ret) { - (void) printf(gettext("Successfully upgraded '%s' " - "from version %llu to version %llu\n\n"), - zpool_get_name(zhp), (u_longlong_t)cur_version, - (u_longlong_t)cbp->cb_version); + if (printnl) { + (void) printf(gettext("\n")); } - return (ret != 0); + return (0); } /* @@ -3793,6 +4940,7 @@ zpool_do_upgrade(int argc, char **argv) upgrade_cbdata_t cb = { 0 }; int ret = 0; boolean_t showversions = B_FALSE; + boolean_t upgradeall = B_FALSE; char *end; @@ -3800,15 +4948,15 @@ zpool_do_upgrade(int argc, char **argv) while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': - cb.cb_all = B_TRUE; + upgradeall = B_TRUE; break; case 'v': showversions = B_TRUE; break; case 'V': cb.cb_version = strtoll(optarg, &end, 10); - if (*end != '\0' || cb.cb_version > SPA_VERSION || - cb.cb_version < SPA_VERSION_1) { + if (*end != '\0' || + !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { (void) fprintf(stderr, gettext("invalid version '%s'\n"), optarg); usage(B_FALSE); @@ -3833,19 +4981,19 @@ zpool_do_upgrade(int argc, char **argv) if (cb.cb_version == 0) { cb.cb_version = SPA_VERSION; - } else if (!cb.cb_all && argc == 0) { + } else if (!upgradeall && argc == 0) { 
(void) fprintf(stderr, gettext("-V option is " "incompatible with other arguments\n")); usage(B_FALSE); } if (showversions) { - if (cb.cb_all || argc != 0) { + if (upgradeall || argc != 0) { (void) fprintf(stderr, gettext("-v option is " "incompatible with other arguments\n")); usage(B_FALSE); } - } else if (cb.cb_all) { + } else if (upgradeall) { if (argc != 0) { (void) fprintf(stderr, gettext("-a option should not " "be used along with a pool name\n")); @@ -3853,11 +5001,28 @@ zpool_do_upgrade(int argc, char **argv) } } - (void) printf(gettext("This system is currently running " - "ZFS pool version %llu.\n\n"), SPA_VERSION); - cb.cb_first = B_TRUE; + (void) printf(gettext("This system supports ZFS pool feature " + "flags.\n\n")); if (showversions) { - (void) printf(gettext("The following versions are " + int i; + + (void) printf(gettext("The following features are " + "supported:\n\n")); + (void) printf(gettext("FEAT DESCRIPTION\n")); + (void) printf("----------------------------------------------" + "---------------\n"); + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *fi = &spa_feature_table[i]; + const char *ro = + (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + " (read-only compatible)" : ""; + + (void) printf("%-37s%s\n", fi->fi_uname, ro); + (void) printf(" %s\n", fi->fi_desc); + } + (void) printf("\n"); + + (void) printf(gettext("The following legacy versions are also " "supported:\n\n")); (void) printf(gettext("VER DESCRIPTION\n")); (void) printf("--- -----------------------------------------" @@ -3890,50 +5055,89 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 21 Deduplication\n")); (void) printf(gettext(" 22 Received properties\n")); (void) printf(gettext(" 23 Slim ZIL\n")); + (void) printf(gettext(" 24 System attributes\n")); + (void) printf(gettext(" 25 Improved scrub stats\n")); + (void) printf(gettext(" 26 Improved snapshot deletion " + "performance\n")); + (void) printf(gettext(" 27 Improved snapshot creation " + "performance\n")); + (void) printf(gettext(" 28 Multiple vdev replacements\n")); (void) printf(gettext("\nFor more information on a particular " - "version, including supported releases, see:\n\n")); - (void) printf("http://www.opensolaris.org/os/community/zfs/" - "version/N\n\n"); - (void) printf(gettext("Where 'N' is the version number.\n")); - } else if (argc == 0) { - int notfound; - + "version, including supported releases,\n")); + (void) printf(gettext("see the ZFS Administration Guide.\n\n")); + } else if (argc == 0 && upgradeall) { + cb.cb_first = B_TRUE; ret = zpool_iter(g_zfs, upgrade_cb, &cb); - notfound = cb.cb_first; - - if (!cb.cb_all && ret == 0) { - if (!cb.cb_first) - (void) printf("\n"); - cb.cb_first = B_TRUE; - cb.cb_newer = B_TRUE; - ret = zpool_iter(g_zfs, upgrade_cb, &cb); - if (!cb.cb_first) { - notfound = B_FALSE; - (void) printf("\n"); + if (ret == 0 && cb.cb_first) { + if (cb.cb_version == SPA_VERSION) { + (void) printf(gettext("All %spools are already " + "formatted using feature flags.\n\n"), + cb.cb_unavail ? gettext("available ") : ""); + (void) printf(gettext("Every %sfeature flags " + "pool already has all supported features " + "enabled.\n"), + cb.cb_unavail ? 
gettext("available ") : ""); + } else { + (void) printf(gettext("All pools are already " + "formatted with version %llu or higher.\n"), + cb.cb_version); } } + } else if (argc == 0) { + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_unavail, &cb); + assert(ret == 0); + + if (!cb.cb_first) { + (void) fprintf(stderr, "\n"); + } + + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); + assert(ret == 0); + + if (cb.cb_first) { + (void) printf(gettext("All %spools are formatted using " + "feature flags.\n\n"), cb.cb_unavail ? + gettext("available ") : ""); + } else { + (void) printf(gettext("\nUse 'zpool upgrade -v' " + "for a list of available legacy versions.\n")); + } - if (ret == 0) { - if (notfound) - (void) printf(gettext("All pools are formatted " - "using this version.\n")); - else if (!cb.cb_all) - (void) printf(gettext("Use 'zpool upgrade -v' " - "for a list of available versions and " - "their associated\nfeatures.\n")); + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); + assert(ret == 0); + + if (cb.cb_first) { + (void) printf(gettext("Every %sfeature flags pool has " + "all supported features enabled.\n"), + cb.cb_unavail ? gettext("available ") : ""); + } else { + (void) printf(gettext("\n")); } } else { - ret = for_each_pool(argc, argv, B_FALSE, NULL, + ret = for_each_pool(argc, argv, B_TRUE, NULL, upgrade_one, &cb); } + if (cb.cb_poolname[0] != '\0') { + (void) printf( + "If you boot from pool '%s', don't forget to update boot code.\n" + "Assuming you use GPT partitioning and da0 is your boot disk\n" + "the following command will do it:\n" + "\n" + "\tgpart bootcode -b /boot/pmbr -p /boot/gptzfsboot -i 1 da0\n\n", + cb.cb_poolname); + } + return (ret); } typedef struct hist_cbdata { boolean_t first; - int longfmt; - int internal; + boolean_t longfmt; + boolean_t internal; } hist_cbdata_t; /* @@ -3945,21 +5149,8 @@ get_history_one(zpool_handle_t *zhp, voi nvlist_t *nvhis; nvlist_t **records; uint_t numrecords; - char *cmdstr; - char *pathstr; - uint64_t dst_time; - time_t tsec; - struct tm t; - char tbuf[30]; int ret, i; - uint64_t who; - struct passwd *pwd; - char *hostname; - char *zonename; - char internalstr[MAXPATHLEN]; hist_cbdata_t *cb = (hist_cbdata_t *)data; - uint64_t txg; - uint64_t ievent; cb->first = B_FALSE; @@ -3971,64 +5162,94 @@ get_history_one(zpool_handle_t *zhp, voi verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, &records, &numrecords) == 0); for (i = 0; i < numrecords; i++) { - if (nvlist_lookup_uint64(records[i], ZPOOL_HIST_TIME, - &dst_time) != 0) - continue; + nvlist_t *rec = records[i]; + char tbuf[30] = ""; - /* is it an internal event or a standard event? 
*/ - if (nvlist_lookup_string(records[i], ZPOOL_HIST_CMD, - &cmdstr) != 0) { - if (cb->internal == 0) + if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { + time_t tsec; + struct tm t; + + tsec = fnvlist_lookup_uint64(records[i], + ZPOOL_HIST_TIME); + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + } + + if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { + (void) printf("%s %s", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { + int ievent = + fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); + if (!cb->internal) continue; - - if (nvlist_lookup_uint64(records[i], - ZPOOL_HIST_INT_EVENT, &ievent) != 0) + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { + (void) printf("%s unrecognized record:\n", + tbuf); + dump_nvlist(rec, 4); + continue; + } + (void) printf("%s [internal %s txg:%lld] %s", tbuf, + zfs_history_event_names[ievent], + fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { + if (!cb->internal) + continue; + (void) printf("%s [txg:%lld] %s", tbuf, + fnvlist_lookup_uint64(rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); + if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { + (void) printf(" %s (%llu)", + fnvlist_lookup_string(rec, + ZPOOL_HIST_DSNAME), + fnvlist_lookup_uint64(rec, + ZPOOL_HIST_DSID)); + } + (void) printf(" %s", fnvlist_lookup_string(rec, + ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { + if (!cb->internal) continue; - verify(nvlist_lookup_uint64(records[i], - ZPOOL_HIST_TXG, &txg) == 0); - verify(nvlist_lookup_string(records[i], - ZPOOL_HIST_INT_STR, &pathstr) == 0); - if (ievent >= LOG_END) + (void) printf("%s ioctl %s\n", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); + if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { + (void) printf(" input:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + ZPOOL_HIST_INPUT_NVL), 8); + } + if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { + (void) printf(" output:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + ZPOOL_HIST_OUTPUT_NVL), 8); + } + } else { + if (!cb->internal) continue; - (void) snprintf(internalstr, - sizeof (internalstr), - "[internal %s txg:%lld] %s", - hist_event_table[ievent], txg, - pathstr); - cmdstr = internalstr; - } - tsec = dst_time; - (void) localtime_r(&tsec, &t); - (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); - (void) printf("%s %s", tbuf, cmdstr); + (void) printf("%s unrecognized record:\n", tbuf); + dump_nvlist(rec, 4); + } if (!cb->longfmt) { (void) printf("\n"); continue; } (void) printf(" ["); - if (nvlist_lookup_uint64(records[i], - ZPOOL_HIST_WHO, &who) == 0) { - pwd = getpwuid((uid_t)who); - if (pwd) - (void) printf("user %s on", - pwd->pw_name); - else - (void) printf("user %d on", - (int)who); - } else { - (void) printf(gettext("no info]\n")); - continue; + if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { + uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); + struct passwd *pwd = getpwuid(who); + (void) printf("user %d ", (int)who); + if (pwd != NULL) + (void) printf("(%s) ", pwd->pw_name); + } + if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { + (void) printf("on %s", + fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); + } + if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { + (void) printf(":%s", + fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); } - if (nvlist_lookup_string(records[i], - ZPOOL_HIST_HOST, &hostname) == 0) { - (void) printf(" %s", hostname); - } - if 
(nvlist_lookup_string(records[i], - ZPOOL_HIST_ZONE, &zonename) == 0) { - (void) printf(":%s", zonename); - } - (void) printf("]"); (void) printf("\n"); } @@ -4043,8 +5264,6 @@ get_history_one(zpool_handle_t *zhp, voi * * Displays the history of commands that modified pools. */ - - int zpool_do_history(int argc, char **argv) { @@ -4057,10 +5276,10 @@ zpool_do_history(int argc, char **argv) while ((c = getopt(argc, argv, "li")) != -1) { switch (c) { case 'l': - cbdata.longfmt = 1; + cbdata.longfmt = B_TRUE; break; case 'i': - cbdata.internal = 1; + cbdata.internal = B_TRUE; break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -4100,28 +5319,56 @@ get_callback(zpool_handle_t *zhp, void * pl == cbp->cb_proplist) continue; - if (zpool_get_prop(zhp, pl->pl_prop, - value, sizeof (value), &srctype) != 0) - continue; + if (pl->pl_prop == ZPROP_INVAL && + (zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop))) { + srctype = ZPROP_SRC_LOCAL; + + if (zpool_prop_get_feature(zhp, pl->pl_user_prop, + value, sizeof (value)) == 0) { + zprop_print_one_property(zpool_get_name(zhp), + cbp, pl->pl_user_prop, value, srctype, + NULL, NULL); + } + } else { + if (zpool_get_prop(zhp, pl->pl_prop, value, + sizeof (value), &srctype, cbp->cb_literal) != 0) + continue; - zprop_print_one_property(zpool_get_name(zhp), cbp, - zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, - NULL); + zprop_print_one_property(zpool_get_name(zhp), cbp, + zpool_prop_to_name(pl->pl_prop), value, srctype, + NULL, NULL); + } } return (0); } +/* + * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... + * + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -o List of columns to display. Defaults to + * "name,property,value,source". + * -p Diplay values in parsable (exact) format. + * + * Get properties of pools in the system. Output space statistics + * for each one as well as other attributes. + */ int zpool_do_get(int argc, char **argv) { zprop_get_cbdata_t cb = { 0 }; zprop_list_t fake_name = { 0 }; int ret; - - if (argc < 3) - usage(B_FALSE); + int c, i; + char *value; cb.cb_first = B_TRUE; + + /* + * Set up default columns and sources. 
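[Editorial aside: the reworked get_history_one() above formats each record's ZPOOL_HIST_TIME as a local "%F.%T" string via localtime_r() and strftime(). A self-contained sketch of just that formatting step, using the current time as a stand-in for the record's timestamp.]

#include <stdio.h>
#include <time.h>

int
main(void)
{
	time_t tsec = time(NULL);	/* stand-in for the record's time */
	struct tm t;
	char tbuf[30] = "";

	(void) localtime_r(&tsec, &t);
	(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
	(void) printf("%s example-command\n", tbuf);
	return (0);
}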
+ */ cb.cb_sources = ZPROP_SRC_ALL; cb.cb_columns[0] = GET_COL_NAME; cb.cb_columns[1] = GET_COL_PROPERTY; @@ -4129,10 +5376,89 @@ zpool_do_get(int argc, char **argv) cb.cb_columns[3] = GET_COL_SOURCE; cb.cb_type = ZFS_TYPE_POOL; - if (zprop_get_list(g_zfs, argv[1], &cb.cb_proplist, + /* check options */ + while ((c = getopt(argc, argv, ":Hpo:")) != -1) { + switch (c) { + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case 'o': + bzero(&cb.cb_columns, sizeof (cb.cb_columns)); + i = 0; + while (*optarg != '\0') { + static char *col_subopts[] = + { "name", "property", "value", "source", + "all", NULL }; + + if (i == ZFS_GET_NCOLS) { + (void) fprintf(stderr, gettext("too " + "many fields given to -o " + "option\n")); + usage(B_FALSE); + } + + switch (getsubopt(&optarg, col_subopts, + &value)) { + case 0: + cb.cb_columns[i++] = GET_COL_NAME; + break; + case 1: + cb.cb_columns[i++] = GET_COL_PROPERTY; + break; + case 2: + cb.cb_columns[i++] = GET_COL_VALUE; + break; + case 3: + cb.cb_columns[i++] = GET_COL_SOURCE; + break; + case 4: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + i = ZFS_GET_NCOLS; + break; + default: + (void) fprintf(stderr, + gettext("invalid column name " + "'%s'\n"), suboptarg); + usage(B_FALSE); + } + } + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); + usage(B_FALSE); + } + + if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, ZFS_TYPE_POOL) != 0) usage(B_FALSE); + argc--; + argv++; + if (cb.cb_proplist != NULL) { fake_name.pl_prop = ZPOOL_PROP_NAME; fake_name.pl_width = strlen(gettext("NAME")); @@ -4140,7 +5466,7 @@ zpool_do_get(int argc, char **argv) cb.cb_proplist = &fake_name; } - ret = for_each_pool(argc - 2, argv + 2, B_TRUE, &cb.cb_proplist, + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, get_callback, &cb); if (cb.cb_proplist == &fake_name) @@ -4236,7 +5562,7 @@ find_command_idx(char *command, int *idx int main(int argc, char **argv) { - int ret; + int ret = 0; int i; char *cmdname; @@ -4269,8 +5595,7 @@ main(int argc, char **argv) if (strcmp(cmdname, "-?") == 0) usage(B_TRUE); - zpool_set_history_str("zpool", argc, argv, history_str); - verify(zpool_stage_history(g_zfs, history_str) == 0); + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); /* * Run the appropriate command. @@ -4287,16 +5612,18 @@ main(int argc, char **argv) * 'freeze' is a vile debugging abomination, so we treat * it as such. 
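[Editorial aside: the new -o handling in zpool get above uses getsubopt(3) to split a comma-separated column list against a fixed keyword table, with "all" only accepted on its own. A trimmed-down sketch of the same parsing, without the "all" keyword and with error handling reduced to a message; the column names mirror the ones in the patch.]

#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char **argv)
{
	static char *subopts[] =
	    { "name", "property", "value", "source", NULL };
	char *options, *value;

	if (argc < 2)
		return (1);
	options = argv[1];		/* e.g. "name,value" */

	while (*options != '\0') {
		switch (getsubopt(&options, subopts, &value)) {
		case 0:
			(void) printf("column: name\n");
			break;
		case 1:
			(void) printf("column: property\n");
			break;
		case 2:
			(void) printf("column: value\n");
			break;
		case 3:
			(void) printf("column: source\n");
			break;
		default:
			(void) fprintf(stderr, "invalid column\n");
			return (1);
		}
	}
	return (0);
}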
*/ - char buf[16384]; - int fd = open(ZFS_DEV, O_RDWR); - (void) strcpy((void *)buf, argv[2]); - return (!!ioctl(fd, ZFS_IOC_POOL_FREEZE, buf)); + zfs_cmd_t zc = { 0 }; + (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); + return (!!zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc)); } else { (void) fprintf(stderr, gettext("unrecognized " "command '%s'\n"), cmdname); usage(B_FALSE); } + if (ret == 0 && log_history) + (void) zpool_log_history(g_zfs, history_str); + libzfs_fini(g_zfs); /* Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zpool_util.h --- src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h 27 Feb 2010 22:29:23 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/cmd/zpool/zpool_util.h 12 Jun 2012 05:55:36 -0000 @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef ZPOOL_UTIL_H @@ -45,7 +44,7 @@ uint_t num_logs(nvlist_t *nv); */ nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, - boolean_t isreplace, boolean_t dryrun, int argc, char **argv); + boolean_t replacing, boolean_t dryrun, int argc, char **argv); nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, splitflags_t flags, int argc, char **argv); Index: src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c,v retrieving revision 1.3 diff -u -p -r1.3 zpool_vdev.c --- src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c 27 Feb 2010 23:43:52 -0000 1.3 +++ src/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c 5 May 2017 16:51:20 -0000 @@ -20,8 +20,9 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov . */ /* @@ -70,15 +71,19 @@ #include #include #include -#include +#include #include -#include +#include #include +#ifdef __FreeBSD__ +#include +#endif +#ifdef __NetBSD__ +#include +#endif #include "zpool_util.h" -#define DISK_ROOT "/dev/dsk" -#define RDISK_ROOT "/dev/rdsk" #define BACKUP_SLICE "s2" /* @@ -111,6 +116,7 @@ vdev_error(const char *fmt, ...) va_end(ap); } +#ifdef illumos static void libdiskmgt_error(int error) { @@ -126,6 +132,155 @@ libdiskmgt_error(int error) } /* + * Validate a device, passing the bulk of the work off to libdiskmgt. + */ +static int +check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare) +{ + char *msg; + int error = 0; + dm_who_type_t who; + + if (force) + who = DM_WHO_ZPOOL_FORCE; + else if (isspare) + who = DM_WHO_ZPOOL_SPARE; + else + who = DM_WHO_ZPOOL; + + if (dm_inuse((char *)path, &msg, who, &error) || error) { + if (error != 0) { + libdiskmgt_error(error); + return (0); + } else { + vdev_error("%s", msg); + free(msg); + return (-1); + } + } + + /* + * If we're given a whole disk, ignore overlapping slices since we're + * about to label it anyway. 
+ */ + error = 0; + if (!wholedisk && !force && + (dm_isoverlapping((char *)path, &msg, &error) || error)) { + if (error == 0) { + /* dm_isoverlapping returned -1 */ + vdev_error(gettext("%s overlaps with %s\n"), path, msg); + free(msg); + return (-1); + } else if (error != ENODEV) { + /* libdiskmgt's devcache only handles physical drives */ + libdiskmgt_error(error); + return (0); + } + } + + return (0); +} + + +/* + * Validate a whole disk. Iterate over all slices on the disk and make sure + * that none is in use by calling check_slice(). + */ +static int +check_disk(const char *name, dm_descriptor_t disk, int force, int isspare) +{ + dm_descriptor_t *drive, *media, *slice; + int err = 0; + int i; + int ret; + + /* + * Get the drive associated with this disk. This should never fail, + * because we already have an alias handle open for the device. + */ + if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE, + &err)) == NULL || *drive == NULL) { + if (err) + libdiskmgt_error(err); + return (0); + } + + if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA, + &err)) == NULL) { + dm_free_descriptors(drive); + if (err) + libdiskmgt_error(err); + return (0); + } + + dm_free_descriptors(drive); + + /* + * It is possible that the user has specified a removable media drive, + * and the media is not present. + */ + if (*media == NULL) { + dm_free_descriptors(media); + vdev_error(gettext("'%s' has no media in drive\n"), name); + return (-1); + } + + if ((slice = dm_get_associated_descriptors(*media, DM_SLICE, + &err)) == NULL) { + dm_free_descriptors(media); + if (err) + libdiskmgt_error(err); + return (0); + } + + dm_free_descriptors(media); + + ret = 0; + + /* + * Iterate over all slices and report any errors. We don't care about + * overlapping slices because we are using the whole disk. + */ + for (i = 0; slice[i] != NULL; i++) { + char *name = dm_get_name(slice[i], &err); + + if (check_slice(name, force, B_TRUE, isspare) != 0) + ret = -1; + + dm_free_name(name); + } + + dm_free_descriptors(slice); + return (ret); +} + +/* + * Validate a device. + */ +static int +check_device(const char *path, boolean_t force, boolean_t isspare) +{ + dm_descriptor_t desc; + int err; + char *dev; + + /* + * For whole disks, libdiskmgt does not include the leading dev path. + */ + dev = strrchr(path, '/'); + assert(dev != NULL); + dev++; + if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) { + err = check_disk(path, desc, force, isspare); + dm_free_descriptor(desc); + return (err); + } + + return (check_slice(path, force, B_FALSE, isspare)); +} +#endif /* illumos */ + +/* * Check that a file is valid. All we can do in this case is check that it's * not in use by another pool, and not in use by swap. 
*/ @@ -139,7 +294,7 @@ check_file(const char *file, boolean_t f pool_state_t state; boolean_t inuse; -#ifndef __NetBSD__ +#ifdef illumos if (dm_inuse_swap(file, &err)) { if (err) libdiskmgt_error(err); @@ -202,6 +357,18 @@ check_file(const char *file, boolean_t f return (ret); } +static int +check_device(const char *name, boolean_t force, boolean_t isspare) +{ + char path[MAXPATHLEN]; + + if (strncmp(name, _PATH_DEV, sizeof(_PATH_DEV) - 1) != 0) + snprintf(path, sizeof(path), "%s%s", _PATH_DEV, name); + else + strlcpy(path, name, sizeof(path)); + + return (check_file(path, force, isspare)); +} /* * By "whole disk" we mean an entire physical disk (something we can @@ -214,12 +381,13 @@ check_file(const char *file, boolean_t f static boolean_t is_whole_disk(const char *arg) { +#ifdef illumos struct dk_gpt *label; int fd; char path[MAXPATHLEN]; (void) snprintf(path, sizeof (path), "%s%s%s", - RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); + ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE); if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) return (B_FALSE); if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { @@ -229,6 +397,28 @@ is_whole_disk(const char *arg) efi_free(label); (void) close(fd); return (B_TRUE); +#endif +#ifdef __FreeBSD__ + int fd; + + fd = g_open(arg, 0); + if (fd >= 0) { + g_close(fd); + return (B_TRUE); + } + return (B_FALSE); +#endif +#ifdef __NetBSD__ + struct disklabel dl; + int fd, rv; + + if ((fd = open(arg, O_RDWR | O_NONBLOCK)) < 0) + return (B_FALSE); + + rv = ioctl(fd, DIOCGDINFO, &dl); + close(fd); + return (rv == 0); +#endif } /* @@ -275,8 +465,10 @@ make_leaf_vdev(const char *arg, uint64_t * /dev/dsk/. As part of this check, see if we've been given a * an entire disk (minus the slice number). */ - (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, - arg); + if (strncmp(arg, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) + strlcpy(path, arg, sizeof (path)); + else + snprintf(path, sizeof (path), "%s%s", _PATH_DEV, arg); wholedisk = is_whole_disk(path); if (!wholedisk && (stat64(path, &statbuf) != 0)) { /* @@ -289,7 +481,7 @@ make_leaf_vdev(const char *arg, uint64_t if (errno == ENOENT) { (void) fprintf(stderr, gettext("cannot open '%s': no such " - "device in %s\n"), arg, DISK_ROOT); + "GEOM provider\n"), arg); (void) fprintf(stderr, gettext("must be a full path or " "shorthand device name\n")); @@ -303,6 +495,14 @@ make_leaf_vdev(const char *arg, uint64_t } } +#ifdef __FreeBSD__ + if (S_ISCHR(statbuf.st_mode)) { + statbuf.st_mode &= ~S_IFCHR; + statbuf.st_mode |= S_IFBLK; + wholedisk = B_FALSE; + } +#endif + /* * Determine whether this is a device or a file. */ @@ -312,7 +512,7 @@ make_leaf_vdev(const char *arg, uint64_t type = VDEV_TYPE_FILE; } else { (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); + "GEOM provider or regular file\n"), path); return (NULL); } @@ -329,6 +529,7 @@ make_leaf_vdev(const char *arg, uint64_t verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); +#ifdef have_devid /* * For a whole disk, defer getting its devid until after labeling it. 
*/ @@ -363,6 +564,7 @@ make_leaf_vdev(const char *arg, uint64_t (void) close(fd); } +#endif return (vdev); } @@ -402,7 +604,9 @@ get_replication(nvlist_t *nvroot, boolea uint_t c, children; nvlist_t *nv; char *type; - replication_level_t lastrep, rep, *ret; + replication_level_t lastrep = {0}; + replication_level_t rep; + replication_level_t *ret; boolean_t dontreport; ret = safe_malloc(sizeof (replication_level_t)); @@ -410,7 +614,6 @@ get_replication(nvlist_t *nvroot, boolea verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); - lastrep.zprl_type = NULL; for (t = 0; t < toplevels; t++) { uint64_t is_log = B_FALSE; @@ -467,6 +670,7 @@ get_replication(nvlist_t *nvroot, boolea dontreport = 0; vdev_size = -1ULL; for (c = 0; c < children; c++) { + boolean_t is_replacing, is_spare; nvlist_t *cnv = child[c]; char *path; struct stat64 statbuf; @@ -483,16 +687,19 @@ get_replication(nvlist_t *nvroot, boolea * If this is a replacing or spare vdev, then * get the real first child of the vdev. */ - if (strcmp(childtype, - VDEV_TYPE_REPLACING) == 0 || - strcmp(childtype, VDEV_TYPE_SPARE) == 0) { + is_replacing = strcmp(childtype, + VDEV_TYPE_REPLACING) == 0; + is_spare = strcmp(childtype, + VDEV_TYPE_SPARE) == 0; + if (is_replacing || is_spare) { nvlist_t **rchild; uint_t rchildren; verify(nvlist_lookup_nvlist_array(cnv, ZPOOL_CONFIG_CHILDREN, &rchild, &rchildren) == 0); - assert(rchildren == 2); + assert((is_replacing && rchildren == 2) + || (is_spare && rchildren >= 2)); cnv = rchild[0]; verify(nvlist_lookup_string(cnv, @@ -725,6 +932,7 @@ check_replication(nvlist_t *config, nvli return (ret); } +#ifdef illumos /* * Go through and find any whole disks in the vdev specification, labelling them * as appropriate. When constructing the vdev spec, we were unable to open this @@ -828,6 +1036,7 @@ make_disks(zpool_handle_t *zhp, nvlist_t return (0); } +#endif /* illumos */ /* * Determine if the given path is a hot spare within the given configuration. @@ -857,8 +1066,8 @@ is_spare(nvlist_t *config, const char *p return (B_FALSE); } free(name); - (void) close(fd); + verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0); nvlist_free(label); @@ -881,16 +1090,17 @@ is_spare(nvlist_t *config, const char *p * Go through and find any devices that are in use. We rely on libdiskmgt for * the majority of this task. */ -static int -check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, - int isspare) +static boolean_t +is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, + boolean_t replacing, boolean_t isspare) { nvlist_t **child; uint_t c, children; char *type, *path; - int ret; + int ret = 0; char buf[MAXPATHLEN]; uint64_t wholedisk; + boolean_t anyinuse = B_FALSE; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); @@ -904,44 +1114,48 @@ check_in_use(nvlist_t *config, nvlist_t * hot spare within the same pool. If so, we allow it * regardless of what libdiskmgt or zpool_in_use() says. 
*/ - if (isreplacing) { + if (replacing) { +#ifdef illumos if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk) == 0 && wholedisk) (void) snprintf(buf, sizeof (buf), "%ss0", path); else +#endif (void) strlcpy(buf, path, sizeof (buf)); + if (is_spare(config, buf)) - return (0); + return (B_FALSE); } - if (strcmp(type, VDEV_TYPE_DISK) == 0 || - strcmp(type, VDEV_TYPE_FILE) == 0) + if (strcmp(type, VDEV_TYPE_DISK) == 0) + ret = check_device(path, force, isspare); + else if (strcmp(type, VDEV_TYPE_FILE) == 0) ret = check_file(path, force, isspare); - return (ret); + return (ret != 0); } for (c = 0; c < children; c++) - if ((ret = check_in_use(config, child[c], force, - isreplacing, B_FALSE)) != 0) - return (ret); + if (is_device_in_use(config, child[c], force, replacing, + B_FALSE)) + anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = check_in_use(config, child[c], force, - isreplacing, B_TRUE)) != 0) - return (ret); + if (is_device_in_use(config, child[c], force, replacing, + B_TRUE)) + anyinuse = B_TRUE; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) for (c = 0; c < children; c++) - if ((ret = check_in_use(config, child[c], force, - isreplacing, B_FALSE)) != 0) - return (ret); + if (is_device_in_use(config, child[c], force, replacing, + B_FALSE)) + anyinuse = B_TRUE; - return (0); + return (anyinuse); } static const char * @@ -1225,10 +1439,12 @@ split_mirror_vdev(zpool_handle_t *zhp, c return (NULL); } +#ifdef illumos if (!flags.dryrun && make_disks(zhp, newroot) != 0) { nvlist_free(newroot); return (NULL); } +#endif /* avoid any tricks in the spec */ verify(nvlist_lookup_nvlist_array(newroot, @@ -1250,8 +1466,7 @@ split_mirror_vdev(zpool_handle_t *zhp, c } if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { - if (newroot != NULL) - nvlist_free(newroot); + nvlist_free(newroot); return (NULL); } @@ -1270,7 +1485,7 @@ split_mirror_vdev(zpool_handle_t *zhp, c */ nvlist_t * make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, - boolean_t isreplacing, boolean_t dryrun, int argc, char **argv) + boolean_t replacing, boolean_t dryrun, int argc, char **argv) { nvlist_t *newroot; nvlist_t *poolconfig = NULL; @@ -1293,8 +1508,7 @@ make_root_vdev(zpool_handle_t *zhp, int * uses (such as a dedicated dump device) that even '-f' cannot * override. */ - if (check_in_use(poolconfig, newroot, force, isreplacing, - B_FALSE) != 0) { + if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { nvlist_free(newroot); return (NULL); } @@ -1309,6 +1523,7 @@ make_root_vdev(zpool_handle_t *zhp, int return (NULL); } +#ifdef illumos /* * Run through the vdev specification and label any whole disks found. */ @@ -1316,6 +1531,7 @@ make_root_vdev(zpool_handle_t *zhp, int nvlist_free(newroot); return (NULL); } +#endif return (newroot); } Index: src/external/cddl/osnet/dist/cmd/ztest/ztest.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/cmd/ztest/ztest.c,v retrieving revision 1.7 diff -u -p -r1.7 ztest.c --- src/external/cddl/osnet/dist/cmd/ztest/ztest.c 28 Mar 2014 02:58:36 -0000 1.7 +++ src/external/cddl/osnet/dist/cmd/ztest/ztest.c 27 Mar 2017 06:26:21 -0000 @@ -19,8 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012 Martin Matuska . All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ /* @@ -50,7 +54,9 @@ * At random times, the child self-immolates with a SIGKILL. * This is the software equivalent of pulling the power cord. * The parent then runs the test again, using the existing - * storage pool, as many times as desired. + * storage pool, as many times as desired. If backwards compatibility + * testing is enabled ztest will sometimes run the "older" version + * of ztest after a SIGKILL. * * (6) To verify that we don't have future leaks or temporal incursions, * many of the functional tests record the transaction group number @@ -67,9 +73,15 @@ * You can ask more more vdevs [-v], datasets [-d], or threads [-t] * to increase the pool capacity, fanout, and overall stress level. * - * The -N(okill) option will suppress kills, so each child runs to completion. - * This can be useful when you're trying to distinguish temporal incursions - * from plain old race conditions. + * Use the -k option to set the desired frequency of kills. + * + * When ztest invokes itself it passes all relevant information through a + * temporary file which is mmap-ed in the child process. This allows shared + * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always + * stored at offset 0 of this file and contains information on the size and + * number of shared structures in the file. The information stored in this file + * must remain backwards compatible with older versions of ztest so that + * ztest can invoke them during backwards compatibility testing (-B). 
*/ #include @@ -94,7 +106,12 @@ #include #include #include +#include +#include +#include #include +#include +#include #include #include #include @@ -104,30 +121,89 @@ #include #include #include +#include #include #include -static char cmdname[] = "ztest"; -static char *zopt_pool = cmdname; +static int ztest_fd_data = -1; +static int ztest_fd_rand = -1; + +typedef struct ztest_shared_hdr { + uint64_t zh_hdr_size; + uint64_t zh_opts_size; + uint64_t zh_size; + uint64_t zh_stats_size; + uint64_t zh_stats_count; + uint64_t zh_ds_size; + uint64_t zh_ds_count; +} ztest_shared_hdr_t; + +static ztest_shared_hdr_t *ztest_shared_hdr; + +typedef struct ztest_shared_opts { + char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; + char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; + char zo_alt_ztest[MAXNAMELEN]; + char zo_alt_libpath[MAXNAMELEN]; + uint64_t zo_vdevs; + uint64_t zo_vdevtime; + size_t zo_vdev_size; + int zo_ashift; + int zo_mirrors; + int zo_raidz; + int zo_raidz_parity; + int zo_datasets; + int zo_threads; + uint64_t zo_passtime; + uint64_t zo_killrate; + int zo_verbose; + int zo_init; + uint64_t zo_time; + uint64_t zo_maxloops; + uint64_t zo_metaslab_gang_bang; +} ztest_shared_opts_t; + +static const ztest_shared_opts_t ztest_opts_defaults = { + .zo_pool = { 'z', 't', 'e', 's', 't', '\0' }, + .zo_dir = { '/', 't', 'm', 'p', '\0' }, + .zo_alt_ztest = { '\0' }, + .zo_alt_libpath = { '\0' }, + .zo_vdevs = 5, + .zo_ashift = SPA_MINBLOCKSHIFT, + .zo_mirrors = 2, + .zo_raidz = 4, + .zo_raidz_parity = 1, + .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ + .zo_datasets = 7, + .zo_threads = 23, + .zo_passtime = 60, /* 60 seconds */ + .zo_killrate = 70, /* 70% kill rate */ + .zo_verbose = 0, + .zo_init = 1, + .zo_time = 300, /* 5 minutes */ + .zo_maxloops = 50, /* max loops during spa_freeze() */ + .zo_metaslab_gang_bang = 32 << 10 +}; + +extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_df_alloc_threshold; +extern uint64_t zfs_deadman_synctime_ms; +extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; + +static ztest_shared_opts_t *ztest_shared_opts; +static ztest_shared_opts_t ztest_opts; + +typedef struct ztest_shared_ds { + uint64_t zd_seq; +} ztest_shared_ds_t; -static uint64_t zopt_vdevs = 5; -static uint64_t zopt_vdevtime; -static int zopt_ashift = SPA_MINBLOCKSHIFT; -static int zopt_mirrors = 2; -static int zopt_raidz = 4; -static int zopt_raidz_parity = 1; -static size_t zopt_vdev_size = SPA_MINDEVSIZE; -static int zopt_datasets = 7; -static int zopt_threads = 23; -static uint64_t zopt_passtime = 60; /* 60 seconds */ -static uint64_t zopt_killrate = 70; /* 70% kill rate */ -static int zopt_verbose = 0; -static int zopt_init = 1; -static char *zopt_dir = "/tmp"; -static uint64_t zopt_time = 300; /* 5 minutes */ +static ztest_shared_ds_t *ztest_shared_ds; +#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) #define BT_MAGIC 0x123456789abcdefULL -#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1) +#define MAXFAULTS() \ + (MAX(zs->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -135,6 +211,7 @@ enum ztest_io_type { ZTEST_IO_WRITE_ZEROES, ZTEST_IO_TRUNCATE, ZTEST_IO_SETATTR, + ZTEST_IO_REWRITE, ZTEST_IO_TYPES }; @@ -192,18 +269,19 @@ typedef struct ztest_od { uint64_t od_crblocksize; uint64_t od_gen; uint64_t od_crgen; - char od_name[MAXNAMELEN]; + char od_name[ZFS_MAX_DATASET_NAME_LEN]; } ztest_od_t; /* * Per-dataset state. 
*/ typedef struct ztest_ds { + ztest_shared_ds_t *zd_shared; objset_t *zd_os; + rwlock_t zd_zilog_lock; zilog_t *zd_zilog; - uint64_t zd_seq; ztest_od_t *zd_od; /* debugging aid */ - char zd_name[MAXNAMELEN]; + char zd_name[ZFS_MAX_DATASET_NAME_LEN]; mutex_t zd_dirobj_lock; rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; @@ -218,11 +296,17 @@ typedef struct ztest_info { ztest_func_t *zi_func; /* test function */ uint64_t zi_iters; /* iterations per execution */ uint64_t *zi_interval; /* execute every seconds */ - uint64_t zi_call_count; /* per-pass count */ - uint64_t zi_call_time; /* per-pass time */ - uint64_t zi_call_next; /* next time to call this function */ } ztest_info_t; +typedef struct ztest_shared_callstate { + uint64_t zc_count; /* per-pass count */ + uint64_t zc_time; /* per-pass time */ + uint64_t zc_next; /* next time to call this function */ +} ztest_shared_callstate_t; + +static ztest_shared_callstate_t *ztest_shared_callstate; +#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) + /* * Note: these aren't static because we want dladdr() to work. */ @@ -233,6 +317,7 @@ ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; ztest_func_t ztest_zil_commit; +ztest_func_t ztest_zil_remount; ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; ztest_func_t ztest_dmu_prealloc; @@ -252,6 +337,8 @@ ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_aux_add_remove; ztest_func_t ztest_split_pool; +ztest_func_t ztest_reguid; +ztest_func_t ztest_spa_upgrade; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -268,6 +355,7 @@ ztest_info_t ztest_info[] = { { ztest_zap_parallel, 100, &zopt_always }, { ztest_split_pool, 1, &zopt_always }, { ztest_zil_commit, 1, &zopt_incessant }, + { ztest_zil_remount, 1, &zopt_sometimes }, { ztest_dmu_read_write_zcopy, 1, &zopt_often }, { ztest_dmu_objset_create_destroy, 1, &zopt_often }, { ztest_dsl_prop_get_set, 1, &zopt_often }, @@ -281,13 +369,17 @@ ztest_info_t ztest_info[] = { { ztest_fault_inject, 1, &zopt_sometimes }, { ztest_ddt_repair, 1, &zopt_sometimes }, { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, + { ztest_reguid, 1, &zopt_rarely }, { ztest_spa_rename, 1, &zopt_rarely }, { ztest_scrub, 1, &zopt_rarely }, + { ztest_spa_upgrade, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_rarely }, + { ztest_vdev_attach_detach, 1, &zopt_sometimes }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, - { ztest_vdev_add_remove, 1, &zopt_vdevtime }, - { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, + { ztest_vdev_add_remove, 1, + &ztest_opts.zo_vdevtime }, + { ztest_vdev_aux_add_remove, 1, + &ztest_opts.zo_vdevtime }, }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -305,8 +397,7 @@ typedef struct ztest_cb_list { * Stuff we need to share writably between parent and child. 
*/ typedef struct ztest_shared { - char *zs_pool; - spa_t *zs_spa; + boolean_t zs_do_init; hrtime_t zs_proc_start; hrtime_t zs_proc_stop; hrtime_t zs_thread_start; @@ -317,12 +408,11 @@ typedef struct ztest_shared { uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; - mutex_t zs_vdev_lock; - rwlock_t zs_name_lock; - ztest_info_t zs_info[ZTEST_FUNCS]; uint64_t zs_splits; uint64_t zs_mirrors; - ztest_ds_t zs_zd[]; + uint64_t zs_metaslab_sz; + uint64_t zs_metaslab_df_alloc_threshold; + uint64_t zs_guid; } ztest_shared_t; #define ID_PARALLEL -1ULL @@ -330,20 +420,26 @@ typedef struct ztest_shared { static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; ztest_shared_t *ztest_shared; -uint64_t *ztest_seq; -static int ztest_random_fd; -static int ztest_dump_core = 1; +static spa_t *ztest_spa = NULL; +static ztest_ds_t *ztest_ds; +static mutex_t ztest_vdev_lock; + +/* + * The ztest_name_lock protects the pool and dataset namespace used by + * the individual tests. To modify the namespace, consumers must grab + * this lock as writer. Grabbing the lock as reader will ensure that the + * namespace does not change while the lock is held. + */ +static rwlock_t ztest_name_lock; + +static boolean_t ztest_dump_core = B_TRUE; static boolean_t ztest_exiting; /* Global commit callback list */ static ztest_cb_list_t zcl; -extern uint64_t metaslab_gang_bang; -extern uint64_t metaslab_df_alloc_threshold; -static uint64_t metaslab_sz; - enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, @@ -378,21 +474,17 @@ fatal(int do_perror, char *message, ...) va_list args; int save_errno = errno; char buf[FATAL_MSG_SZ]; - size_t len, blklen = sizeof(buf); (void) fflush(stdout); va_start(args, message); - len = snprintf(buf, blklen, "ztest: "); - if (len > blklen) - len = blklen; + (void) sprintf(buf, "ztest: "); /* LINTED */ - len += vsnprintf(buf + len, blklen - len, message, args); + (void) vsprintf(buf + strlen(buf), message, args); va_end(args); - if (len > blklen) - len = blklen; if (do_perror) { - snprintf(buf + len, blklen - len, ": %s", strerror(save_errno)); + (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), + ": %s", strerror(save_errno)); } (void) fprintf(stderr, "%s\n", buf); fatal_msg = buf; /* to ease debugging */ @@ -460,63 +552,71 @@ nicenumtoull(const char *buf) static void usage(boolean_t requested) { + const ztest_shared_opts_t *zo = &ztest_opts_defaults; + char nice_vdev_size[10]; char nice_gang_bang[10]; FILE *fp = requested ? 
stdout : stderr; - nicenum(zopt_vdev_size, nice_vdev_size, sizeof(nice_vdev_size)); - nicenum(metaslab_gang_bang, nice_gang_bang, sizeof(nice_gang_bang)); + nicenum(zo->zo_vdev_size, nice_vdev_size); + nicenum(zo->zo_metaslab_gang_bang, nice_gang_bang); (void) fprintf(fp, "Usage: %s\n" "\t[-v vdevs (default: %llu)]\n" "\t[-s size_of_each_vdev (default: %s)]\n" - "\t[-a alignment_shift (default: %d) (use 0 for random)]\n" + "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" "\t[-r raidz_disks (default: %d)]\n" "\t[-R raidz_parity (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" - "\t[-i initialize pool i times (default: %d)]\n" - "\t[-k kill percentage (default: %llu%%)]\n" + "\t[-i init_count (default: %d)] initialize pool i times\n" + "\t[-k kill_percentage (default: %llu%%)]\n" "\t[-p pool_name (default: %s)]\n" - "\t[-f file directory for vdev files (default: %s)]\n" - "\t[-V(erbose)] (use multiple times for ever more blather)\n" - "\t[-E(xisting)] (use existing pool instead of creating new one)\n" - "\t[-T time] total run time (default: %llu sec)\n" - "\t[-P passtime] time per pass (default: %llu sec)\n" + "\t[-f dir (default: %s)] file directory for vdev files\n" + "\t[-V] verbose (use multiple times for ever more blather)\n" + "\t[-E] use existing pool instead of creating new one\n" + "\t[-T time (default: %llu sec)] total run time\n" + "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" + "\t[-P passtime (default: %llu sec)] time per pass\n" + "\t[-B alt_ztest (default: )] alternate ztest path\n" "\t[-h] (print help)\n" "", - cmdname, - (u_longlong_t)zopt_vdevs, /* -v */ + zo->zo_pool, + (u_longlong_t)zo->zo_vdevs, /* -v */ nice_vdev_size, /* -s */ - zopt_ashift, /* -a */ - zopt_mirrors, /* -m */ - zopt_raidz, /* -r */ - zopt_raidz_parity, /* -R */ - zopt_datasets, /* -d */ - zopt_threads, /* -t */ + zo->zo_ashift, /* -a */ + zo->zo_mirrors, /* -m */ + zo->zo_raidz, /* -r */ + zo->zo_raidz_parity, /* -R */ + zo->zo_datasets, /* -d */ + zo->zo_threads, /* -t */ nice_gang_bang, /* -g */ - zopt_init, /* -i */ - (u_longlong_t)zopt_killrate, /* -k */ - zopt_pool, /* -p */ - zopt_dir, /* -f */ - (u_longlong_t)zopt_time, /* -T */ - (u_longlong_t)zopt_passtime); /* -P */ + zo->zo_init, /* -i */ + (u_longlong_t)zo->zo_killrate, /* -k */ + zo->zo_pool, /* -p */ + zo->zo_dir, /* -f */ + (u_longlong_t)zo->zo_time, /* -T */ + (u_longlong_t)zo->zo_maxloops, /* -F */ + (u_longlong_t)zo->zo_passtime); exit(requested ? 
0 : 1); } static void process_options(int argc, char **argv) { + char *path; + ztest_shared_opts_t *zo = &ztest_opts; + int opt; uint64_t value; + char altdir[MAXNAMELEN] = { 0 }; - /* By default, test gang blocks for blocks 32K and greater */ - metaslab_gang_bang = 32 << 10; + bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:h")) != EOF) { + "v:s:a:m:r:R:d:t:g:i:k:p:f:VET:P:hF:B:")) != EOF) { value = 0; switch (opt) { case 'v': @@ -532,59 +632,76 @@ process_options(int argc, char **argv) case 'k': case 'T': case 'P': + case 'F': value = nicenumtoull(optarg); } switch (opt) { case 'v': - zopt_vdevs = value; + zo->zo_vdevs = value; break; case 's': - zopt_vdev_size = MAX(SPA_MINDEVSIZE, value); + zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); break; case 'a': - zopt_ashift = value; + zo->zo_ashift = value; break; case 'm': - zopt_mirrors = value; + zo->zo_mirrors = value; break; case 'r': - zopt_raidz = MAX(1, value); + zo->zo_raidz = MAX(1, value); break; case 'R': - zopt_raidz_parity = MIN(MAX(value, 1), 3); + zo->zo_raidz_parity = MIN(MAX(value, 1), 3); break; case 'd': - zopt_datasets = MAX(1, value); + zo->zo_datasets = MAX(1, value); break; case 't': - zopt_threads = MAX(1, value); + zo->zo_threads = MAX(1, value); break; case 'g': - metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, value); + zo->zo_metaslab_gang_bang = MAX(SPA_MINBLOCKSIZE << 1, + value); break; case 'i': - zopt_init = value; + zo->zo_init = value; break; case 'k': - zopt_killrate = value; + zo->zo_killrate = value; break; case 'p': - zopt_pool = strdup(optarg); + (void) strlcpy(zo->zo_pool, optarg, + sizeof (zo->zo_pool)); break; case 'f': - zopt_dir = strdup(optarg); + path = realpath(optarg, NULL); + if (path == NULL) { + (void) fprintf(stderr, "error: %s: %s\n", + optarg, strerror(errno)); + usage(B_FALSE); + } else { + (void) strlcpy(zo->zo_dir, path, + sizeof (zo->zo_dir)); + } break; case 'V': - zopt_verbose++; + zo->zo_verbose++; break; case 'E': - zopt_init = 0; + zo->zo_init = 0; break; case 'T': - zopt_time = value; + zo->zo_time = value; break; case 'P': - zopt_passtime = MAX(1, value); + zo->zo_passtime = MAX(1, value); + break; + case 'F': + zo->zo_maxloops = MAX(1, value); + break; + case 'B': + (void) strlcpy(altdir, optarg, sizeof (altdir)); break; case 'h': usage(B_TRUE); @@ -596,17 +713,75 @@ process_options(int argc, char **argv) } } - zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1); + zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); - zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time * NANOSEC / zopt_vdevs : + zo->zo_vdevtime = + (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : UINT64_MAX >> 2); + + if (strlen(altdir) > 0) { + char *cmd; + char *realaltdir; + char *bin; + char *ztest; + char *isa; + int isalen; + + cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + VERIFY(NULL != realpath(getexecname(), cmd)); + if (0 != access(altdir, F_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest path: %s", + altdir); + } + VERIFY(NULL != realpath(altdir, realaltdir)); + + /* + * 'cmd' should be of the form "/usr/bin//ztest". + * We want to extract to determine if we should use + * 32 or 64 bit binaries. 
+ */ + bin = strstr(cmd, "/usr/bin/"); + ztest = strstr(bin, "/ztest"); + isa = bin + 9; + isalen = ztest - isa; + (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), + "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); + (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), + "%s/usr/lib/%.*s", realaltdir, isalen, isa); + + if (0 != access(zo->zo_alt_ztest, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest: %s", + zo->zo_alt_ztest); + } else if (0 != access(zo->zo_alt_libpath, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate lib directory %s", + zo->zo_alt_libpath); + } + + umem_free(cmd, MAXPATHLEN); + umem_free(realaltdir, MAXPATHLEN); + } } static void ztest_kill(ztest_shared_t *zs) { - zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); - zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); + + /* + * Before we kill off ztest, make sure that the config is updated. + * See comment above spa_config_sync(). + */ + mutex_enter(&spa_namespace_lock); + spa_config_sync(ztest_spa, B_FALSE, B_FALSE); + mutex_exit(&spa_namespace_lock); + + zfs_dbgmsg_print(FTAG); (void) kill(getpid(), SIGKILL); } @@ -615,10 +790,12 @@ ztest_random(uint64_t range) { uint64_t r; + ASSERT3S(ztest_fd_rand, >=, 0); + if (range == 0) return (0); - if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) fatal(1, "short read from /dev/urandom"); return (r % range); @@ -634,13 +811,13 @@ ztest_record_enospc(const char *s) static uint64_t ztest_get_ashift(void) { - if (zopt_ashift == 0) - return (SPA_MINBLOCKSHIFT + ztest_random(3)); - return (zopt_ashift); + if (ztest_opts.zo_ashift == 0) + return (SPA_MINBLOCKSHIFT + ztest_random(5)); + return (ztest_opts.zo_ashift); } static nvlist_t * -make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) +make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char pathbuf[MAXPATHLEN]; uint64_t vdev; @@ -654,12 +831,15 @@ make_vdev_file(char *path, char *aux, si if (aux != NULL) { vdev = ztest_shared->zs_vdev_aux; - (void) snprintf(path, sizeof(pathbuf), ztest_aux_template, - zopt_dir, zopt_pool, aux, vdev); + (void) snprintf(path, sizeof (pathbuf), + ztest_aux_template, ztest_opts.zo_dir, + pool == NULL ? ztest_opts.zo_pool : pool, + aux, vdev); } else { vdev = ztest_shared->zs_vdev_next_leaf++; - (void) snprintf(path, sizeof(pathbuf), ztest_dev_template, - zopt_dir, zopt_pool, vdev); + (void) snprintf(path, sizeof (pathbuf), + ztest_dev_template, ztest_opts.zo_dir, + pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); } } @@ -681,23 +861,24 @@ make_vdev_file(char *path, char *aux, si } static nvlist_t * -make_vdev_raidz(char *path, char *aux, size_t size, uint64_t ashift, int r) +make_vdev_raidz(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r) { nvlist_t *raidz, **child; int c; if (r < 2) - return (make_vdev_file(path, aux, size, ashift)); + return (make_vdev_file(path, aux, pool, size, ashift)); child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < r; c++) - child[c] = make_vdev_file(path, aux, size, ashift); + child[c] = make_vdev_file(path, aux, pool, size, ashift); VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, VDEV_TYPE_RAIDZ) == 0); VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - zopt_raidz_parity) == 0); + ztest_opts.zo_raidz_parity) == 0); VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, child, r) == 0); @@ -710,19 +891,19 @@ make_vdev_raidz(char *path, char *aux, s } static nvlist_t * -make_vdev_mirror(char *path, char *aux, size_t size, uint64_t ashift, - int r, int m) +make_vdev_mirror(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r, int m) { nvlist_t *mirror, **child; int c; if (m < 1) - return (make_vdev_raidz(path, aux, size, ashift, r)); + return (make_vdev_raidz(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, size, ashift, r); + child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, @@ -739,8 +920,8 @@ make_vdev_mirror(char *path, char *aux, } static nvlist_t * -make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, - int log, int r, int m, int t) +make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, + int log, int r, int m, int t) { nvlist_t *root, **child; int c; @@ -750,7 +931,8 @@ make_vdev_root(char *path, char *aux, si child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < t; c++) { - child[c] = make_vdev_mirror(path, aux, size, ashift, r, m); + child[c] = make_vdev_mirror(path, aux, pool, size, ashift, + r, m); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, log) == 0); } @@ -768,11 +950,40 @@ make_vdev_root(char *path, char *aux, si return (root); } +/* + * Find a random spa version. Returns back a random spa version in the + * range [initial_version, SPA_VERSION_FEATURES]. + */ +static uint64_t +ztest_random_spa_version(uint64_t initial_version) +{ + uint64_t version = initial_version; + + if (version <= SPA_VERSION_BEFORE_FEATURES) { + version = version + + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); + } + + if (version > SPA_VERSION_BEFORE_FEATURES) + version = SPA_VERSION_FEATURES; + + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + return (version); +} + static int ztest_random_blocksize(void) { - return (1 << (SPA_MINBLOCKSHIFT + - ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); + uint64_t block_shift; + /* + * Choose a block size >= the ashift. + * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. 
+ */ + int maxbs = SPA_OLD_MAXBLOCKSHIFT; + if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) + maxbs = 20; + block_shift = ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); + return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int @@ -822,20 +1033,18 @@ ztest_dsl_prop_set_uint64(char *osname, uint64_t curval; int error; - error = dsl_prop_set(osname, propname, - (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), - sizeof (value), 1, &value); + error = dsl_prop_set_int(osname, propname, + (inherit ? ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (error); } - ASSERT3U(error, ==, 0); + ASSERT0(error); - VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), - 1, &curval, setpoint), ==, 0); + VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); - if (zopt_verbose >= 6) { + if (ztest_opts.zo_verbose >= 6) { VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); (void) printf("%s %s = %s at '%s'\n", osname, propname, valname, setpoint); @@ -845,9 +1054,9 @@ ztest_dsl_prop_set_uint64(char *osname, } static int -ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) +ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) { - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; nvlist_t *props = NULL; int error; @@ -862,7 +1071,7 @@ ztest_spa_prop_set_uint64(ztest_shared_t ztest_record_enospc(FTAG); return (error); } - ASSERT3U(error, ==, 0); + ASSERT0(error); return (error); } @@ -969,13 +1178,17 @@ ztest_range_unlock(rl_t *rl) } static void -ztest_zd_init(ztest_ds_t *zd, objset_t *os) +ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) { zd->zd_os = os; zd->zd_zilog = dmu_objset_zil(os); - zd->zd_seq = 0; + zd->zd_shared = szd; dmu_objset_name(os, zd->zd_name); + if (zd->zd_shared != NULL) + zd->zd_shared->zd_seq = 0; + + VERIFY(rwlock_init(&zd->zd_zilog_lock, USYNC_THREAD, NULL) == 0); VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0); for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) @@ -1065,13 +1278,13 @@ static void ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) { - ASSERT(bt->bt_magic == BT_MAGIC); - ASSERT(bt->bt_objset == dmu_objset_id(os)); - ASSERT(bt->bt_object == object); - ASSERT(bt->bt_offset == offset); - ASSERT(bt->bt_gen <= gen); - ASSERT(bt->bt_txg <= txg); - ASSERT(bt->bt_crtxg == crtxg); + ASSERT3U(bt->bt_magic, ==, BT_MAGIC); + ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); + ASSERT3U(bt->bt_object, ==, object); + ASSERT3U(bt->bt_offset, ==, offset); + ASSERT3U(bt->bt_gen, <=, gen); + ASSERT3U(bt->bt_txg, <=, txg); + ASSERT3U(bt->bt_crtxg, ==, crtxg); } static ztest_block_tag_t * @@ -1098,7 +1311,7 @@ ztest_bt_bonus(dmu_buf_t *db) #define lrz_bonustype lr_rdev #define lrz_bonuslen lr_crtime[1] -static uint64_t +static void ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) { char *name = (void *)(lr + 1); /* name follows lr */ @@ -1106,40 +1319,41 @@ ztest_log_create(ztest_ds_t *zd, dmu_tx_ itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) - return (0); + return; itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); - return (zil_itx_assign(zd->zd_zilog, itx, tx)); + zil_itx_assign(zd->zd_zilog, itx, tx); } -static uint64_t -ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr) +static void +ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, 
lr_remove_t *lr, uint64_t object) { char *name = (void *)(lr + 1); /* name follows lr */ size_t namesize = strlen(name) + 1; itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) - return (0); + return; itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) + namesize - sizeof (lr_t)); - return (zil_itx_assign(zd->zd_zilog, itx, tx)); + itx->itx_oid = object; + zil_itx_assign(zd->zd_zilog, itx, tx); } -static uint64_t +static void ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) { itx_t *itx; itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); if (zil_replaying(zd->zd_zilog, tx)) - return (0); + return; if (lr->lr_length > ZIL_MAX_LOG_DATA) write_state = WR_INDIRECT; @@ -1157,42 +1371,43 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t itx->itx_private = zd; itx->itx_wr_state = write_state; itx->itx_sync = (ztest_random(8) == 0); - itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); - return (zil_itx_assign(zd->zd_zilog, itx, tx)); + zil_itx_assign(zd->zd_zilog, itx, tx); } -static uint64_t +static void ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) - return (0); + return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); - return (zil_itx_assign(zd->zd_zilog, itx, tx)); + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); } -static uint64_t +static void ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) { itx_t *itx; if (zil_replaying(zd->zd_zilog, tx)) - return (0); + return; itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, sizeof (*lr) - sizeof (lr_t)); - return (zil_itx_assign(zd->zd_zilog, itx, tx)); + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); } /* @@ -1324,7 +1539,7 @@ ztest_replay_remove(ztest_ds_t *zd, lr_r VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); - (void) ztest_log_remove(zd, tx, lr); + (void) ztest_log_remove(zd, tx, lr, object); dmu_tx_commit(tx); @@ -1552,7 +1767,7 @@ ztest_replay_setattr(ztest_ds_t *zd, lr_ ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); ASSERT3U(lr->lr_size, <=, db->db_size); - VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0); + VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); bbt = ztest_bt_bonus(db); ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg); @@ -1671,9 +1886,16 @@ ztest_get_data(void *arg, lr_write_t *lr zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, RL_READER); - error = dmu_buf_hold(os, object, offset, zgd, &db); + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); if (error == 0) { + blkptr_t *obp = dmu_buf_get_blkptr(db); + if (obp) { + ASSERT(BP_IS_HOLE(bp)); + *bp = *obp; + } + zgd->zgd_db = db; zgd->zgd_bp = bp; @@ -1819,6 +2041,9 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t continue; } + /* + * No object was found. 
+ */ if (od->od_object == 0) continue; @@ -1934,6 +2159,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t static void ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) { + int err; ztest_block_tag_t wbt; dmu_object_info_t doi; enum ztest_io_type io_type; @@ -1951,6 +2177,8 @@ ztest_io(ztest_ds_t *zd, uint64_t object if (ztest_random(2) == 0) io_type = ZTEST_IO_WRITE_TAG; + (void) rw_rdlock(&zd->zd_zilog_lock); + switch (io_type) { case ZTEST_IO_WRITE_TAG: @@ -1984,8 +2212,29 @@ ztest_io(ztest_ds_t *zd, uint64_t object case ZTEST_IO_SETATTR: (void) ztest_setattr(zd, object); break; + + case ZTEST_IO_REWRITE: + (void) rw_rdlock(&ztest_name_lock); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_COMPRESSION, + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + (void) rw_unlock(&ztest_name_lock); + + VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, + DMU_READ_NO_PREFETCH)); + + (void) ztest_write(zd, object, offset, blocksize, data); + break; } + (void) rw_unlock(&zd->zd_zilog_lock); + umem_free(data, blocksize); } @@ -2040,7 +2289,9 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_ { zilog_t *zilog = zd->zd_zilog; - zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS)); + (void) rw_rdlock(&zd->zd_zilog_lock); + + zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); /* * Remember the committed values in zd, which is in parent/child @@ -2048,9 +2299,42 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_ * will verify that the log really does contain this record. */ mutex_enter(&zilog->zl_lock); - ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); - zd->zd_seq = zilog->zl_commit_lr_seq; + ASSERT(zd->zd_shared != NULL); + ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); + zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; mutex_exit(&zilog->zl_lock); + + (void) rw_unlock(&zd->zd_zilog_lock); +} + +/* + * This function is designed to simulate the operations that occur during a + * mount/unmount operation. We hold the dataset across these operations in an + * attempt to expose any implicit assumptions about ZIL management. + */ +/* ARGSUSED */ +void +ztest_zil_remount(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + + /* + * We grab the zd_dirobj_lock to ensure that no other thread is + * updating the zil (i.e. adding in-memory log records) and the + * zd_zilog_lock to block any I/O. + */ + VERIFY0(mutex_lock(&zd->zd_dirobj_lock)); + (void) rw_wrlock(&zd->zd_zilog_lock); + + /* zfsvfs_teardown() */ + zil_close(zd->zd_zilog); + + /* zfsvfs_setup() */ + VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog); + zil_replay(os, zd, ztest_replay_vector); + + (void) rw_unlock(&zd->zd_zilog_lock); + VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0); } /* @@ -2061,39 +2345,111 @@ ztest_zil_commit(ztest_ds_t *zd, uint64_ void ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; + ztest_shared_opts_t *zo = &ztest_opts; spa_t *spa; nvlist_t *nvroot; /* * Attempt to create using a bad file. */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); VERIFY3U(ENOENT, ==, - spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); + spa_create("ztest_bad_file", nvroot, NULL, NULL)); nvlist_free(nvroot); /* * Attempt to create using a bad mirror. 
*/ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 2, 1); VERIFY3U(ENOENT, ==, - spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); + spa_create("ztest_bad_mirror", nvroot, NULL, NULL)); nvlist_free(nvroot); /* * Attempt to create an existing pool. It shouldn't matter * what's in the nvroot; we should fail with EEXIST. */ - (void) rw_rdlock(&zs->zs_name_lock); - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); + (void) rw_rdlock(&ztest_name_lock); + nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(EEXIST, ==, spa_create(zo->zo_pool, nvroot, NULL, NULL)); nvlist_free(nvroot); - VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); - VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool)); + VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG)); + VERIFY3U(EBUSY, ==, spa_destroy(zo->zo_pool)); + spa_close(spa, FTAG); + + (void) rw_unlock(&ztest_name_lock); +} + +/* ARGSUSED */ +void +ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa; + uint64_t initial_version = SPA_VERSION_INITIAL; + uint64_t version, newversion; + nvlist_t *nvroot, *props; + char *name; + + VERIFY0(mutex_lock(&ztest_vdev_lock)); + name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); + + /* + * Clean up from previous runs. + */ + (void) spa_destroy(name); + + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, + 0, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + + /* + * If we're configuring a RAIDZ device then make sure that the + * the initial version is capable of supporting that feature. + */ + switch (ztest_opts.zo_raidz_parity) { + case 0: + case 1: + initial_version = SPA_VERSION_INITIAL; + break; + case 2: + initial_version = SPA_VERSION_RAIDZ2; + break; + case 3: + initial_version = SPA_VERSION_RAIDZ3; + break; + } + + /* + * Create a pool with a spa version that can be upgraded. Pick + * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. 
+ */ + do { + version = ztest_random_spa_version(initial_version); + } while (version > SPA_VERSION_BEFORE_FEATURES); + + props = fnvlist_alloc(); + fnvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), version); + VERIFY0(spa_create(name, nvroot, props, NULL)); + fnvlist_free(nvroot); + fnvlist_free(props); + + VERIFY0(spa_open(name, &spa, FTAG)); + VERIFY3U(spa_version(spa), ==, version); + newversion = ztest_random_spa_version(version + 1); + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("upgrading spa version from %llu to %llu\n", + (u_longlong_t)version, (u_longlong_t)newversion); + } + + spa_upgrade(spa, newversion); + VERIFY3U(spa_version(spa), >, version); + VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, + zpool_prop_to_name(ZPOOL_PROP_VERSION))); spa_close(spa, FTAG); - (void) rw_unlock(&zs->zs_name_lock); + strfree(name); + VERIFY0(mutex_unlock(&ztest_vdev_lock)); } static vdev_t * @@ -2140,14 +2496,14 @@ void ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; uint64_t leaves; uint64_t guid; nvlist_t *nvroot; int error; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2172,9 +2528,9 @@ ztest_vdev_add_remove(ztest_ds_t *zd, ui * dmu_objset_destroy() to fail with EBUSY thus * leaving the dataset in an inconsistent state. */ - VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0); + VERIFY(rw_wrlock(&ztest_name_lock) == 0); error = spa_vdev_remove(spa, guid, B_FALSE); - VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0); + VERIFY(rw_unlock(&ztest_name_lock) == 0); if (error && error != EEXIST) fatal(0, "spa_vdev_remove() = %d", error); @@ -2184,8 +2540,10 @@ ztest_vdev_add_remove(ztest_ds_t *zd, ui /* * Make 1/4 of the devices be log devices. */ - nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); + nvroot = make_vdev_root(NULL, NULL, NULL, + ztest_opts.zo_vdev_size, 0, + ztest_random(4) == 0, ztest_opts.zo_raidz, + zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -2196,7 +2554,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, ui fatal(0, "spa_vdev_add() = %d", error); } - VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2207,7 +2565,7 @@ void ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; spa_aux_vdev_t *sav; char *aux; @@ -2222,7 +2580,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd aux = ZPOOL_CONFIG_L2CACHE; } - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2239,8 +2597,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd for (;;) { char path[MAXPATHLEN]; int c; - (void) snprintf(path, sizeof(path), ztest_aux_template, zopt_dir, - zopt_pool, aux, zs->zs_vdev_aux); + (void) snprintf(path, sizeof (path), ztest_aux_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, aux, + zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) @@ -2258,8 +2617,8 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd /* * Add a new device. 
*/ - nvlist_t *nvroot = make_vdev_root(NULL, aux, - (zopt_vdev_size * 5) / 4, 0, 0, 0, 0, 1); + nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, + (ztest_opts.zo_vdev_size * 5) / 4, 0, 0, 0, 0, 1); error = spa_vdev_add(spa, nvroot); if (error != 0) fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); @@ -2278,7 +2637,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2289,17 +2648,17 @@ void ztest_split_pool(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; vdev_t *rvd = spa->spa_root_vdev; nvlist_t *tree, **child, *config, *split, **schild; uint_t c, children, schildren = 0, lastlogid = 0; int error = 0; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); /* ensure we have a useable config; mirrors of raidz aren't supported */ - if (zs->zs_mirrors < 3 || zopt_raidz > 1) { - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2309,8 +2668,11 @@ ztest_split_pool(ztest_ds_t *zd, uint64_ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); /* generate a config from the existing config */ + mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, &tree) == 0); + mutex_exit(&spa->spa_props_lock); + VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); @@ -2355,9 +2717,9 @@ ztest_split_pool(ztest_ds_t *zd, uint64_ spa_config_exit(spa, SCL_VDEV, FTAG); - (void) rw_wrlock(&zs->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); nvlist_free(config); @@ -2370,7 +2732,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_ ++zs->zs_splits; --zs->zs_mirrors; } - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } @@ -2382,7 +2744,7 @@ void ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; @@ -2391,7 +2753,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); uint64_t oldguid, pguid; - size_t oldsize, newsize; + uint64_t oldsize, newsize; char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; int replacing; int oldvd_has_siblings = B_FALSE; @@ -2399,8 +2761,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, int oldvd_is_log; int error, expected_error; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); - leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2426,12 +2788,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / zopt_raidz]; + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; } - if (zopt_raidz > 1) { + if (ztest_opts.zo_raidz > 1) { ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); - ASSERT(oldvd->vdev_children == zopt_raidz); - oldvd = oldvd->vdev_child[leaf 
% zopt_raidz]; + ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; } /* @@ -2460,7 +2822,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, if (error != 0 && error != ENODEV && error != EBUSY && error != ENOTSUP) fatal(0, "detach (%s) returned %d", oldpath, error); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2474,7 +2836,8 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, sizeof (newpath), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + leaf); + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + leaf); if (ztest_random(2) == 0) newpath[strlen(newpath) - 1] = 'b'; newvd = vdev_lookup_by_path(rvd, newpath); @@ -2523,7 +2886,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, /* * Build the nvlist describing newpath. */ - root = make_vdev_root(newpath, NULL, newvd == NULL ? newsize : 0, + root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, ashift, 0, 0, 0, 1); error = spa_vdev_attach(spa, oldguid, root, replacing); @@ -2549,11 +2912,11 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, if (error != expected_error && expected_error != EBUSY) { fatal(0, "attach (%s %llu, %s %llu, %d) " "returned %d, expected %d", - oldpath, (longlong_t)oldsize, newpath, - (longlong_t)newsize, replacing, error, expected_error); + oldpath, oldsize, newpath, + newsize, replacing, error, expected_error); } - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2576,7 +2939,7 @@ grow_vdev(vdev_t *vd, void *arg) fsize = lseek(fd, 0, SEEK_END); (void) ftruncate(fd, *newsize); - if (zopt_verbose >= 6) { + if (ztest_opts.zo_verbose >= 6) { (void) printf("%s grew from %lu to %lu bytes\n", vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); } @@ -2612,7 +2975,7 @@ online_vdev(vdev_t *vd, void *arg) * vdev_open fails is by checking the returned newstate. */ if (error || newstate != VDEV_STATE_HEALTHY) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("Unable to expand vdev, state %llu, " "error %d\n", (u_longlong_t)newstate, error); } @@ -2627,7 +2990,7 @@ online_vdev(vdev_t *vd, void *arg) * trying to online it. */ if (generation != spa->spa_config_generation) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("vdev configuration has changed, " "guid %llu, state %llu, expected gen %llu, " "got gen %llu\n", @@ -2673,8 +3036,7 @@ vdev_walk_tree(vdev_t *vd, vdev_t *(*fun void ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; vdev_t *vd, *tvd; metaslab_class_t *mc; metaslab_group_t *mg; @@ -2682,7 +3044,7 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui uint64_t top; uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); spa_config_enter(spa, SCL_STATE, spa, RW_READER); top = ztest_random_vdev_top(spa, B_TRUE); @@ -2708,16 +3070,16 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui * original size, and it has a valid psize. 
*/ if (tvd->vdev_state != VDEV_STATE_HEALTHY || - psize == 0 || psize >= 4 * zopt_vdev_size) { + psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } ASSERT(psize > 0); newsize = psize + psize / 8; ASSERT3U(newsize, >, psize); - if (zopt_verbose >= 6) { + if (ztest_opts.zo_verbose >= 6) { (void) printf("Expanding LUN %s from %lu to %lu\n", vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); } @@ -2730,12 +3092,12 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || vdev_walk_tree(tvd, online_vdev, NULL) != NULL || tvd->vdev_state != VDEV_STATE_HEALTHY) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not expand LUN because " "the vdev configuration changed.\n"); } spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2764,12 +3126,12 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui new_class_space = metaslab_class_get_space(mc); if (tvd->vdev_mg != mg || mg->mg_class != mc) { - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { (void) printf("Could not verify LUN expansion due to " "intervening vdev offline or remove.\n"); } spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); return; } @@ -2787,17 +3149,17 @@ ztest_vdev_LUN_growth(ztest_ds_t *zd, ui fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", old_class_space, new_class_space); - if (zopt_verbose >= 5) { + if (ztest_opts.zo_verbose >= 5) { char oldnumbuf[6], newnumbuf[6]; - nicenum(old_class_space, oldnumbuf, sizeof(oldnumbuf)); - nicenum(new_class_space, newnumbuf, sizeof(newnumbuf)); + nicenum(old_class_space, oldnumbuf); + nicenum(new_class_space, newnumbuf); (void) printf("%s grew from %s to %s\n", spa->spa_name, oldnumbuf, newnumbuf); } spa_config_exit(spa, SCL_STATE, spa); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } /* @@ -2814,6 +3176,22 @@ ztest_objset_create_cb(objset_t *os, voi DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); } +static int +ztest_dataset_create(char *dsname) +{ + uint64_t zilset = ztest_random(100); + int err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, + ztest_objset_create_cb, NULL); + + if (err || zilset < 80) + return (err); + + if (ztest_opts.zo_verbose >= 6) + (void) printf("Setting dataset %s to sync always\n", dsname); + return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, + ZFS_SYNC_ALWAYS, B_FALSE)); +} + /* ARGSUSED */ static int ztest_objset_destroy_cb(const char *name, void *arg) @@ -2825,53 +3203,57 @@ ztest_objset_destroy_cb(const char *name /* * Verify that the dataset contains a directory object. */ - VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os)); + VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, FTAG, &os)); error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ - ASSERT3U(error, ==, 0); + ASSERT0(error); ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } - dmu_objset_rele(os, FTAG); + dmu_objset_disown(os, FTAG); /* * Destroy the dataset. 
*/ - VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE)); + if (strchr(name, '@') != NULL) { + VERIFY0(dsl_destroy_snapshot(name, B_FALSE)); + } else { + VERIFY0(dsl_destroy_head(name)); + } return (0); } static boolean_t ztest_snapshot_create(char *osname, uint64_t id) { - char snapname[MAXNAMELEN]; + char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; - (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, - (u_longlong_t)id); + (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); - error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, - NULL, B_FALSE); + error = dmu_objset_snapshot_one(osname, snapname); if (error == ENOSPC) { ztest_record_enospc(FTAG); return (B_FALSE); } - if (error != 0 && error != EEXIST) - fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); + if (error != 0 && error != EEXIST) { + fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, + snapname, error); + } return (B_TRUE); } static boolean_t ztest_snapshot_destroy(char *osname, uint64_t id) { - char snapname[MAXNAMELEN]; + char snapname[ZFS_MAX_DATASET_NAME_LEN]; int error; - (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, + (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, (u_longlong_t)id); - error = dmu_objset_destroy(snapname, B_FALSE); + error = dsl_destroy_snapshot(snapname, B_FALSE); if (error != 0 && error != ENOENT) fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); return (B_TRUE); @@ -2881,18 +3263,17 @@ ztest_snapshot_destroy(char *osname, uin void ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; ztest_ds_t zdtmp; int iters; int error; objset_t *os, *os2; - char name[MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; zilog_t *zilog; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); - (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu", - zs->zs_pool, (u_longlong_t)id); + (void) snprintf(name, sizeof (name), "%s/temp_%llu", + ztest_opts.zo_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log @@ -2901,7 +3282,7 @@ ztest_dmu_objset_create_destroy(ztest_ds */ if (ztest_random(2) == 0 && dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { - ztest_zd_init(&zdtmp, os); + ztest_zd_init(&zdtmp, NULL, os); zil_replay(os, &zdtmp, ztest_replay_vector); ztest_zd_fini(&zdtmp); dmu_objset_disown(os, FTAG); @@ -2918,26 +3299,25 @@ ztest_dmu_objset_create_destroy(ztest_ds /* * Verify that the destroyed dataset is no longer in the namespace. */ - VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os)); + VERIFY3U(ENOENT, ==, dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, + FTAG, &os)); /* * Verify that we can create a new dataset. */ - error = dmu_objset_create(name, DMU_OST_OTHER, 0, - ztest_objset_create_cb, NULL); + error = ztest_dataset_create(name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); return; } fatal(0, "dmu_objset_create(%s) = %d", name, error); } - VERIFY3U(0, ==, - dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); + VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); - ztest_zd_init(&zdtmp, os); + ztest_zd_init(&zdtmp, NULL, os); /* * Open the intent log for it. 
@@ -2977,7 +3357,7 @@ ztest_dmu_objset_create_destroy(ztest_ds dmu_objset_disown(os, FTAG); ztest_zd_fini(&zdtmp); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -2986,12 +3366,10 @@ ztest_dmu_objset_create_destroy(ztest_ds void ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); (void) ztest_snapshot_destroy(zd->zd_name, id); (void) ztest_snapshot_create(zd->zd_name, id); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -3000,34 +3378,39 @@ ztest_dmu_snapshot_create_destroy(ztest_ void ztest_dsl_dataset_cleanup(char *osname, uint64_t id) { - char snap1name[MAXNAMELEN]; - char clone1name[MAXNAMELEN]; - char snap2name[MAXNAMELEN]; - char clone2name[MAXNAMELEN]; - char snap3name[MAXNAMELEN]; + char snap1name[ZFS_MAX_DATASET_NAME_LEN]; + char clone1name[ZFS_MAX_DATASET_NAME_LEN]; + char snap2name[ZFS_MAX_DATASET_NAME_LEN]; + char clone2name[ZFS_MAX_DATASET_NAME_LEN]; + char snap3name[ZFS_MAX_DATASET_NAME_LEN]; int error; - (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); - (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); - (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); - (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); - (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); + (void) snprintf(snap1name, sizeof (snap1name), + "%s@s1_%llu", osname, id); + (void) snprintf(clone1name, sizeof (clone1name), + "%s/c1_%llu", osname, id); + (void) snprintf(snap2name, sizeof (snap2name), + "%s@s2_%llu", clone1name, id); + (void) snprintf(clone2name, sizeof (clone2name), + "%s/c2_%llu", osname, id); + (void) snprintf(snap3name, sizeof (snap3name), + "%s@s3_%llu", clone1name, id); - error = dmu_objset_destroy(clone2name, B_FALSE); + error = dsl_destroy_head(clone2name); if (error && error != ENOENT) - fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); - error = dmu_objset_destroy(snap3name, B_FALSE); + fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); + error = dsl_destroy_snapshot(snap3name, B_FALSE); if (error && error != ENOENT) - fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); - error = dmu_objset_destroy(snap2name, B_FALSE); + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); + error = dsl_destroy_snapshot(snap2name, B_FALSE); if (error && error != ENOENT) - fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); - error = dmu_objset_destroy(clone1name, B_FALSE); + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); + error = dsl_destroy_head(clone1name); if (error && error != ENOENT) - fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); - error = dmu_objset_destroy(snap1name, B_FALSE); + fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); + error = dsl_destroy_snapshot(snap1name, B_FALSE); if (error && error != ENOENT) - fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); } /* @@ -3036,29 +3419,31 @@ ztest_dsl_dataset_cleanup(char *osname, void ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - objset_t *clone; - dsl_dataset_t *ds; - char snap1name[MAXNAMELEN]; - char clone1name[MAXNAMELEN]; - char snap2name[MAXNAMELEN]; - char clone2name[MAXNAMELEN]; - char snap3name[MAXNAMELEN]; + objset_t *os; + char 
snap1name[ZFS_MAX_DATASET_NAME_LEN]; + char clone1name[ZFS_MAX_DATASET_NAME_LEN]; + char snap2name[ZFS_MAX_DATASET_NAME_LEN]; + char clone2name[ZFS_MAX_DATASET_NAME_LEN]; + char snap3name[ZFS_MAX_DATASET_NAME_LEN]; char *osname = zd->zd_name; int error; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); ztest_dsl_dataset_cleanup(osname, id); - (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); - (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); - (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); - (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); - (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); + (void) snprintf(snap1name, sizeof (snap1name), + "%s@s1_%llu", osname, id); + (void) snprintf(clone1name, sizeof (clone1name), + "%s/c1_%llu", osname, id); + (void) snprintf(snap2name, sizeof (snap2name), + "%s@s2_%llu", clone1name, id); + (void) snprintf(clone2name, sizeof (clone2name), + "%s/c2_%llu", osname, id); + (void) snprintf(snap3name, sizeof (snap3name), + "%s@s3_%llu", clone1name, id); - error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, - NULL, B_FALSE); + error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -3067,12 +3452,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_ fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); } - error = dmu_objset_hold(snap1name, FTAG, &clone); - if (error) - fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error); - - error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0); - dmu_objset_rele(clone, FTAG); + error = dmu_objset_clone(clone1name, snap1name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -3081,8 +3461,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_ fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); } - error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1, - NULL, B_FALSE); + error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -3091,8 +3470,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_ fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); } - error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, - NULL, B_FALSE); + error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); if (error && error != EEXIST) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -3101,12 +3479,7 @@ ztest_dsl_dataset_promote_busy(ztest_ds_ fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); } - error = dmu_objset_hold(snap3name, FTAG, &clone); - if (error) - fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); - - error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0); - dmu_objset_rele(clone, FTAG); + error = dmu_objset_clone(clone2name, snap3name); if (error) { if (error == ENOSPC) { ztest_record_enospc(FTAG); @@ -3115,19 +3488,24 @@ ztest_dsl_dataset_promote_busy(ztest_ds_ fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); } - error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds); + error = dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, FTAG, &os); if (error) - fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error); + fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); error = dsl_dataset_promote(clone2name, NULL); + if (error == ENOSPC) { + dmu_objset_disown(os, FTAG); + ztest_record_enospc(FTAG); + goto out; + } if (error != 
EBUSY) fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, error); - dsl_dataset_disown(ds, FTAG); + dmu_objset_disown(os, FTAG); out: ztest_dsl_dataset_cleanup(osname, id); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -3219,7 +3597,8 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); - dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); + dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, + ZIO_PRIORITY_SYNC_READ); /* * Pick a random index and compute the offsets into packobj and bigobj. @@ -3247,10 +3626,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin */ error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); - ASSERT3U(error, ==, 0); + ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); - ASSERT3U(error, ==, 0); + ASSERT0(error); /* * Get a tx for the mods to both packobj and bigobj. @@ -3264,6 +3643,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin else dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + /* This accounts for setting the checksum/compression. */ + dmu_tx_hold_bonus(tx, bigobj); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); if (txg == 0) { umem_free(packbuf, packsize); @@ -3271,11 +3653,19 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin return; } - dmu_object_set_checksum(os, bigobj, - (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx); + enum zio_checksum cksum; + do { + cksum = (enum zio_checksum) + ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); + } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); + dmu_object_set_checksum(os, bigobj, cksum, tx); - dmu_object_set_compress(os, bigobj, - (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx); + enum zio_compress comp; + do { + comp = (enum zio_compress) + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); + } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); + dmu_object_set_compress(os, bigobj, comp, tx); /* * For each index from n to n + s, verify that the existing bufwad @@ -3326,7 +3716,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { - if (zopt_verbose >= 7) { + if (ztest_opts.zo_verbose >= 7) { (void) printf("freeing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, @@ -3335,7 +3725,7 @@ ztest_dmu_read_write(ztest_ds_t *zd, uin } VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { - if (zopt_verbose >= 7) { + if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, @@ -3560,10 +3950,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, packsize, packbuf, DMU_READ_PREFETCH); - ASSERT3U(error, ==, 0); + ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, DMU_READ_PREFETCH); - ASSERT3U(error, ==, 0); + ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, n, chunksize, txg); @@ -3573,7 +3963,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z * Now write them out. 
*/ dmu_write(os, packobj, packoff, packsize, packbuf, tx); - if (zopt_verbose >= 7) { + if (ztest_opts.zo_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, @@ -3597,7 +3987,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z if (i == 1) { VERIFY(dmu_buf_hold(os, bigobj, off, - FTAG, &dbt) == 0); + FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } if (i != 5) { dmu_assign_arcbuf(bonus_db, off, @@ -3764,8 +4154,8 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) snprintf(propname, sizeof(propname), "prop_%llu", (u_longlong_t)prop); - (void) snprintf(txgname, sizeof(txgname), "txg_%llu", (u_longlong_t)prop); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); bzero(value, sizeof (value)); last_txg = 0; @@ -3826,15 +4216,15 @@ ztest_zap(ztest_ds_t *zd, uint64_t id) * Remove a random pair of entries. */ prop = ztest_random(ZTEST_ZAP_MAX_PROPS); - (void) snprintf(propname, sizeof(propname), "prop_%llu", (u_longlong_t)prop); - (void) snprintf(txgname, sizeof(txgname), "txg_%llu", (u_longlong_t)prop); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); if (error == ENOENT) return; - ASSERT3U(error, ==, 0); + ASSERT0(error); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, object, B_TRUE, NULL); @@ -3869,7 +4259,7 @@ ztest_fzap(ztest_ds_t *zd, uint64_t id) * 2050 entries we should see ptrtbl growth and leaf-block split. */ for (int i = 0; i < 2050; i++) { - char name[MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; uint64_t value = i; dmu_tx_t *tx; int error; @@ -3934,7 +4324,7 @@ ztest_zap_parallel(ztest_ds_t *zd, uint6 } count = -1ULL; - VERIFY(zap_count(os, object, &count) == 0); + VERIFY0(zap_count(os, object, &count)); ASSERT(count != -1ULL); /* @@ -4030,7 +4420,7 @@ ztest_commit_callback(void *arg, int err data->zcd_called = B_TRUE; if (error == ECANCELED) { - ASSERT3U(data->zcd_txg, ==, 0); + ASSERT0(data->zcd_txg); ASSERT(!data->zcd_added); /* @@ -4168,7 +4558,7 @@ ztest_dmu_commit_callbacks(ztest_ds_t *z */ tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && - tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) { + (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) { fatal(0, "Commit callback threshold exceeded, oldest txg: %" PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); } @@ -4214,37 +4604,51 @@ ztest_dsl_prop_get_set(ztest_ds_t *zd, u ZFS_PROP_COPIES, ZFS_PROP_DEDUP }; - ztest_shared_t *zs = ztest_shared; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* ARGSUSED */ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; nvlist_t *props = NULL; - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); - (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, + (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_DEDUPDITTO, ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); - VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); + VERIFY0(spa_prop_get(ztest_spa, &props)); - if 
(zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); nvlist_free(props); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); +} + +static int +user_release_one(const char *snapname, const char *holdname) +{ + nvlist_t *snaps, *holds; + int error; + + snaps = fnvlist_alloc(); + holds = fnvlist_alloc(); + fnvlist_add_boolean(holds, holdname); + fnvlist_add_nvlist(snaps, snapname, holds); + fnvlist_free(holds); + error = dsl_dataset_user_release(snaps, NULL); + fnvlist_free(snaps); + return (error); } /* @@ -4260,29 +4664,37 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, char fullname[100]; char clonename[100]; char tag[100]; - char osname[MAXNAMELEN]; + char osname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *holds; - (void) rw_rdlock(&ztest_shared->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); dmu_objset_name(os, osname); - (void) snprintf(snapname, 100, "sh1_%llu", id); - (void) snprintf(fullname, 100, "%s@%s", osname, snapname); - (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id); - (void) snprintf(tag, 100, "%tag_%llu", id); + (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", id); + (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); + (void) snprintf(clonename, sizeof (clonename), + "%s/ch1_%llu", osname, id); + (void) snprintf(tag, sizeof (tag), "tag_%llu", id); /* * Clean up from any previous run. */ - (void) dmu_objset_destroy(clonename, B_FALSE); - (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE); - (void) dmu_objset_destroy(fullname, B_FALSE); + error = dsl_destroy_head(clonename); + if (error != ENOENT) + ASSERT0(error); + error = user_release_one(fullname, tag); + if (error != ESRCH && error != ENOENT) + ASSERT0(error); + error = dsl_destroy_snapshot(fullname, B_FALSE); + if (error != ENOENT) + ASSERT0(error); /* * Create snapshot, clone it, mark snap for deferred destroy, * destroy clone, verify snap was also destroyed. */ - error = dmu_objset_snapshot(osname, snapname, NULL, FALSE); + error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); @@ -4291,12 +4703,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } - error = dmu_objset_hold(fullname, FTAG, &origin); - if (error) - fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); - - error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0); - dmu_objset_rele(origin, FTAG); + error = dmu_objset_clone(clonename, fullname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_clone"); @@ -4305,15 +4712,15 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); } - error = dmu_objset_destroy(fullname, B_TRUE); + error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { - fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } - error = dmu_objset_destroy(clonename, B_FALSE); + error = dsl_destroy_head(clonename); if (error) - fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error); + fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); error = dmu_objset_hold(fullname, FTAG, &origin); if (error != ENOENT) @@ -4324,7 +4731,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, * destroy a held snapshot, mark for deferred destroy, * release hold, verify snapshot was destroyed. 
*/ - error = dmu_objset_snapshot(osname, snapname, NULL, FALSE); + error = dmu_objset_snapshot_one(osname, snapname); if (error) { if (error == ENOSPC) { ztest_record_enospc("dmu_objset_snapshot"); @@ -4333,30 +4740,39 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); } - error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, B_TRUE); - if (error) - fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag); + holds = fnvlist_alloc(); + fnvlist_add_string(holds, fullname, tag); + error = dsl_dataset_user_hold(holds, 0, NULL); + fnvlist_free(holds); + + if (error == ENOSPC) { + ztest_record_enospc("dsl_dataset_user_hold"); + goto out; + } else if (error) { + fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", + fullname, tag, error); + } - error = dmu_objset_destroy(fullname, B_FALSE); + error = dsl_destroy_snapshot(fullname, B_FALSE); if (error != EBUSY) { - fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d", + fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", fullname, error); } - error = dmu_objset_destroy(fullname, B_TRUE); + error = dsl_destroy_snapshot(fullname, B_TRUE); if (error) { - fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", fullname, error); } - error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE); + error = user_release_one(fullname, tag); if (error) - fatal(0, "dsl_dataset_user_release(%s)", fullname, tag); + fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); - VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); + VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); out: - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -4367,16 +4783,16 @@ void ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; int fd; uint64_t offset; uint64_t leaves; - uint64_t bad = 0x1990c0ffeedecade; + uint64_t bad = 0x1990c0ffeedecadeULL; uint64_t top, leaf; char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; - int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ + int bshift = SPA_MAXBLOCKSHIFT + 2; int iters = 1000; int maxfaults; int mirror_save; @@ -4384,15 +4800,23 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 uint64_t guid0 = 0; boolean_t islog = B_FALSE; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); maxfaults = MAXFAULTS(); - leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; mirror_save = zs->zs_mirrors; - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); ASSERT(leaves >= 1); /* + * Grab the name lock as reader. There are some operations + * which don't like to have their vdevs changed while + * they are in progress (i.e. spa_change_guid). Those + * operations will have grabbed the name lock as writer. + */ + (void) rw_rdlock(&ztest_name_lock); + + /* * We need SCL_STATE here because we're going to look at vd0->vdev_tsd. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); @@ -4411,15 +4835,24 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 * and we'll write random garbage to the randomly chosen leaf. 
*/ (void) snprintf(path0, sizeof (path0), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + zs->zs_splits); + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + zs->zs_splits); (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + leaf); + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); if (vd0 != NULL && vd0->vdev_top->vdev_islog) islog = B_TRUE; - if (vd0 != NULL && maxfaults != 1) { + /* + * If the top-level vdev needs to be resilvered + * then we only allow faults on the device that is + * resilvering. + */ + if (vd0 != NULL && maxfaults != 1 && + (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) || + vd0->vdev_resilver_txg != 0)) { /* * Make vd0 explicitly claim to be unreadable, * or unwriteable, or reach behind its back @@ -4450,6 +4883,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 if (sav->sav_count == 0) { spa_config_exit(spa, SCL_STATE, FTAG); + (void) rw_unlock(&ztest_name_lock); return; } vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)]; @@ -4463,6 +4897,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 } spa_config_exit(spa, SCL_STATE, FTAG); + (void) rw_unlock(&ztest_name_lock); /* * If we can tolerate two or more faults, or we're dealing @@ -4482,14 +4917,25 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 * leaving the dataset in an inconsistent state. */ if (islog) - (void) rw_wrlock(&ztest_shared->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); if (islog) - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } else { + /* + * Ideally we would like to be able to randomly + * call vdev_[on|off]line without holding locks + * to force unpredictable failures but the side + * effects of vdev_[on|off]line prevent us from + * doing so. We grab the ztest_vdev_lock here to + * prevent a race between injection testing and + * aux_vdev removal. + */ + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); (void) vdev_online(spa, guid0, 0, NULL); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); } } @@ -4507,16 +4953,63 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 fsize = lseek(fd, 0, SEEK_END); while (--iters != 0) { + /* + * The offset must be chosen carefully to ensure that + * we do not inject a given logical block with errors + * on two different leaf devices, because ZFS can not + * tolerate that (if maxfaults==1). + * + * We divide each leaf into chunks of size + * (# leaves * SPA_MAXBLOCKSIZE * 4). Within each chunk + * there is a series of ranges to which we can inject errors. + * Each range can accept errors on only a single leaf vdev. + * The error injection ranges are separated by ranges + * which we will not inject errors on any device (DMZs). + * Each DMZ must be large enough such that a single block + * can not straddle it, so that a single block can not be + * a target in two different injection ranges (on different + * leaf vdevs). + * + * For example, with 3 leaves, each chunk looks like: + * 0 to 32M: injection range for leaf 0 + * 32M to 64M: DMZ - no injection allowed + * 64M to 96M: injection range for leaf 1 + * 96M to 128M: DMZ - no injection allowed + * 128M to 160M: injection range for leaf 2 + * 160M to 192M: DMZ - no injection allowed + */ offset = ztest_random(fsize / (leaves << bshift)) * (leaves << bshift) + (leaf << bshift) + (ztest_random(1ULL << (bshift - 1)) & -8ULL); - if (offset >= fsize) + /* + * Only allow damage to the labels at one end of the vdev. 
+ * + * If all labels are damaged, the device will be totally + * inaccessible, which will result in loss of data, + * because we also damage (parts of) the other side of + * the mirror/raidz. + * + * Additionally, we will always have both an even and an + * odd label, so that we can handle crashes in the + * middle of vdev_config_sync(). + */ + if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) continue; - VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + /* + * The two end labels are stored at the "end" of the disk, but + * the end of the disk (vdev_psize) is aligned to + * sizeof (vdev_label_t). + */ + uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t)); + if ((leaf & 1) == 1 && + offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE) + continue; + + VERIFY(mutex_lock(&ztest_vdev_lock) == 0); if (mirror_save != zs->zs_mirrors) { - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); (void) close(fd); return; } @@ -4525,9 +5018,9 @@ ztest_fault_inject(ztest_ds_t *zd, uint6 fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); - VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + VERIFY(mutex_unlock(&ztest_vdev_lock) == 0); - if (zopt_verbose >= 7) + if (ztest_opts.zo_verbose >= 7) (void) printf("injected bad word into %s," " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } @@ -4542,7 +5035,7 @@ void ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) { ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; objset_t *os = zd->zd_os; ztest_od_t od[1]; uint64_t object, blocksize, txg, pattern, psize; @@ -4565,19 +5058,24 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_ * Take the name lock as writer to prevent anyone else from changing * the pool and dataset properies we need to maintain during this test. */ - (void) rw_wrlock(&zs->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, B_FALSE) != 0 || ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, B_FALSE) != 0) { - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); return; } + dmu_objset_stats_t dds; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + object = od[0].od_object; blocksize = od[0].od_blocksize; - pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os); + pattern = zs->zs_guid ^ dds.dds_guid; ASSERT(object != 0); @@ -4585,7 +5083,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_ dmu_tx_hold_write(tx, object, 0, copies * blocksize); txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); if (txg == 0) { - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); return; } @@ -4594,7 +5092,12 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_ */ for (int i = 0; i < copies; i++) { uint64_t offset = i * blocksize; - VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db) == 0); + int error = dmu_buf_hold(os, object, offset, FTAG, &db, + DMU_READ_NO_PREFETCH); + if (error != 0) { + fatal(B_FALSE, "dmu_buf_hold(%p, %llu, %llu) = %u", + os, (long long)object, (long long) offset, error); + } ASSERT(db->db_offset == offset); ASSERT(db->db_size == blocksize); ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || @@ -4610,7 +5113,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_ /* * Find out what block we got. 
*/ - VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db) == 0); + VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db, + DMU_READ_NO_PREFETCH)); blk = *((dmu_buf_impl_t *)db)->db_blkptr; dmu_buf_rele(db, FTAG); @@ -4627,7 +5131,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_ zio_buf_free(buf, psize); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -4637,12 +5141,41 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_ void ztest_scrub(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - spa_t *spa = zs->zs_spa; + spa_t *spa = ztest_spa; - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) spa_scan(spa, POOL_SCAN_SCRUB); (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) spa_scan(spa, POOL_SCAN_SCRUB); +} + +/* + * Change the guid for the pool. + */ +/* ARGSUSED */ +void +ztest_reguid(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + uint64_t orig, load; + int error; + + orig = spa_guid(spa); + load = spa_load_guid(spa); + + (void) rw_wrlock(&ztest_name_lock); + error = spa_change_guid(spa); + (void) rw_unlock(&ztest_name_lock); + + if (error != 0) + return; + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Changed guid old %llu -> %llu\n", + (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); + } + + VERIFY3U(orig, !=, spa_guid(spa)); + VERIFY3U(load, ==, spa_load_guid(spa)); } /* @@ -4652,13 +5185,12 @@ ztest_scrub(ztest_ds_t *zd, uint64_t id) void ztest_spa_rename(ztest_ds_t *zd, uint64_t id) { - ztest_shared_t *zs = ztest_shared; char *oldname, *newname; spa_t *spa; - (void) rw_wrlock(&zs->zs_name_lock); + (void) rw_wrlock(&ztest_name_lock); - oldname = zs->zs_pool; + oldname = ztest_opts.zo_pool; newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); (void) strcpy(newname, oldname); (void) strcat(newname, "_tmp"); @@ -4678,7 +5210,7 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_ */ VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); - ASSERT(spa == zs->zs_spa); + ASSERT(spa == ztest_spa); spa_close(spa, FTAG); /* @@ -4691,12 +5223,12 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_ */ VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); - ASSERT(spa == zs->zs_spa); + ASSERT(spa == ztest_spa); spa_close(spa, FTAG); umem_free(newname, strlen(newname) + 1); - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); } /* @@ -4714,7 +5246,7 @@ ztest_run_zdb(char *pool) int isalen; FILE *fp; - (void) realpath(getexecname(), zdb); + strlcpy(zdb, "/usr/bin/ztest", sizeof(zdb)); /* zdb lives in /usr/sbin, while ztest lives in /usr/bin */ bin = strstr(zdb, "/usr/bin/"); @@ -4723,22 +5255,24 @@ ztest_run_zdb(char *pool) isalen = ztest - isa; isa = strdup(isa); /* LINTED */ - (void) snprintf(bin, sizeof(zdb) - (bin - zdb), - "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s", + (void) sprintf(bin, + "/usr/sbin%.*s/zdb -bcc%s%s -d -U %s %s", isalen, isa, - zopt_verbose >= 3 ? "s" : "", - zopt_verbose >= 4 ? "v" : "", + ztest_opts.zo_verbose >= 3 ? "s" : "", + ztest_opts.zo_verbose >= 4 ? 
"v" : "", + spa_config_path, pool); free(isa); - if (zopt_verbose >= 5) + if (ztest_opts.zo_verbose >= 5) (void) printf("Executing %s\n", strstr(zdb, "zdb ")); fp = popen(zdb, "r"); + assert(fp != NULL); while (fgets(zbuf, sizeof (zbuf), fp) != NULL) - if (zopt_verbose >= 3) + if (ztest_opts.zo_verbose >= 3) (void) printf("%s", zbuf); status = pclose(fp); @@ -4758,12 +5292,12 @@ ztest_walk_pool_directory(char *header) { spa_t *spa = NULL; - if (zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) (void) printf("%s\n", header); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) - if (zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) (void) printf("\t%s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); } @@ -4774,8 +5308,9 @@ ztest_spa_import_export(char *oldname, c nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; + int error; - if (zopt_verbose >= 4) { + if (ztest_opts.zo_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", oldname, newname); } @@ -4794,7 +5329,7 @@ ztest_spa_import_export(char *oldname, c * Kick off a scrub to tickle scrub/export races. */ if (ztest_random(2) == 0) - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) spa_scan(spa, POOL_SCAN_SCRUB); pool_guid = spa_guid(spa); spa_close(spa, FTAG); @@ -4818,19 +5353,24 @@ ztest_spa_import_export(char *oldname, c /* * Import it under the new name. */ - VERIFY3U(0, ==, spa_import(newname, config, NULL)); + error = spa_import(newname, config, NULL, 0); + if (error != 0) { + dump_nvlist(config, 0); + fatal(B_FALSE, "couldn't import pool %s as %s: error %u", + oldname, newname, error); + } ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ - VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL)); + VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); /* * Try to import it under a different name -- should fail with EEXIST. */ - VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL)); + VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); /* * Verify that the pool is no longer visible under the old name. @@ -4850,7 +5390,7 @@ ztest_spa_import_export(char *oldname, c static void ztest_resume(spa_t *spa) { - if (spa_suspended(spa) && zopt_verbose >= 6) + if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) (void) printf("resuming from suspended state\n"); spa_vdev_state_enter(spa, SCL_NONE); vdev_clear(spa, NULL); @@ -4867,6 +5407,12 @@ ztest_resume_thread(void *arg) if (spa_suspended(spa)) ztest_resume(spa); (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); } return (NULL); } @@ -4875,23 +5421,40 @@ static void * ztest_deadman_thread(void *arg) { ztest_shared_t *zs = arg; - int grace = 300; - hrtime_t delta; + spa_t *spa = ztest_spa; + hrtime_t delta, total = 0; - delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; - - (void) poll(NULL, 0, (int)(1000 * delta)); + for (;;) { + delta = zs->zs_thread_stop - zs->zs_thread_start + + MSEC2NSEC(zfs_deadman_synctime_ms); - fatal(0, "failed to complete within %d seconds of deadline", grace); + (void) poll(NULL, 0, (int)NSEC2MSEC(delta)); - return (NULL); + /* + * If the pool is suspended then fail immediately. Otherwise, + * check to see if the pool is making any progress. If + * vdev_deadman() discovers that there hasn't been any recent + * I/Os then it will end up aborting the tests. 
+ */ + if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { + fatal(0, "aborting test after %llu seconds because " + "pool has transitioned to a suspended state.", + zfs_deadman_synctime_ms / 1000); + return (NULL); + } + vdev_deadman(spa->spa_root_vdev); + + total += zfs_deadman_synctime_ms/1000; + (void) printf("ztest has been running for %lld seconds\n", + total); + } } static void -ztest_execute(ztest_info_t *zi, uint64_t id) +ztest_execute(int test, ztest_info_t *zi, uint64_t id) { - ztest_shared_t *zs = ztest_shared; - ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets]; + ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; + ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); hrtime_t functime = gethrtime(); for (int i = 0; i < zi->zi_iters; i++) @@ -4899,10 +5462,10 @@ ztest_execute(ztest_info_t *zi, uint64_t functime = gethrtime() - functime; - atomic_add_64(&zi->zi_call_count, 1); - atomic_add_64(&zi->zi_call_time, functime); + atomic_add_64(&zc->zc_count, 1); + atomic_add_64(&zc->zc_time, functime); - if (zopt_verbose >= 4) { + if (ztest_opts.zo_verbose >= 4) { Dl_info dli; (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%6.2f sec in %s\n", @@ -4913,11 +5476,13 @@ ztest_execute(ztest_info_t *zi, uint64_t static void * ztest_thread(void *arg) { + int rand; uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; uint64_t call_next; hrtime_t now; ztest_info_t *zi; + ztest_shared_callstate_t *zc; while ((now = gethrtime()) < zs->zs_thread_stop) { /* @@ -4935,13 +5500,16 @@ ztest_thread(void *arg) /* * Pick a random function to execute. */ - zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)]; - call_next = zi->zi_call_next; + rand = ztest_random(ZTEST_FUNCS); + zi = &ztest_info[rand]; + zc = ZTEST_GET_SHARED_CALLSTATE(rand); + call_next = zc->zc_next; if (now >= call_next && - atomic_cas_64(&zi->zi_call_next, call_next, call_next + - ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) - ztest_execute(zi, id); + atomic_cas_64(&zc->zc_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { + ztest_execute(rand, zi, id); + } } return (NULL); @@ -4950,17 +5518,17 @@ ztest_thread(void *arg) static void ztest_dataset_name(char *dsname, char *pool, int d) { - (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d); + (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); } static void -ztest_dataset_destroy(ztest_shared_t *zs, int d) +ztest_dataset_destroy(int d) { - char name[MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; - ztest_dataset_name(name, zs->zs_pool, d); + ztest_dataset_name(name, ztest_opts.zo_pool, d); - if (zopt_verbose >= 3) + if (ztest_opts.zo_verbose >= 3) (void) printf("Destroying %s to free up space\n", name); /* @@ -4968,8 +5536,10 @@ ztest_dataset_destroy(ztest_shared_t *zs * ztest thread t operates on dataset (t % zopt_datasets), * so there may be more than one thing to clean up. 
*/ - for (int t = d; t < zopt_threads; t += zopt_datasets) + for (int t = d; t < ztest_opts.zo_threads; + t += ztest_opts.zo_datasets) { ztest_dsl_dataset_cleanup(name, t); + } (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); @@ -4997,32 +5567,31 @@ ztest_dataset_dirobj_verify(ztest_ds_t * } static int -ztest_dataset_open(ztest_shared_t *zs, int d) +ztest_dataset_open(int d) { - ztest_ds_t *zd = &zs->zs_zd[d]; - uint64_t committed_seq = zd->zd_seq; + ztest_ds_t *zd = &ztest_ds[d]; + uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; objset_t *os; zilog_t *zilog; - char name[MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; int error; - ztest_dataset_name(name, zs->zs_pool, d); + ztest_dataset_name(name, ztest_opts.zo_pool, d); - (void) rw_rdlock(&zs->zs_name_lock); + (void) rw_rdlock(&ztest_name_lock); - error = dmu_objset_create(name, DMU_OST_OTHER, 0, - ztest_objset_create_cb, NULL); + error = ztest_dataset_create(name); if (error == ENOSPC) { - (void) rw_unlock(&zs->zs_name_lock); + (void) rw_unlock(&ztest_name_lock); ztest_record_enospc(FTAG); return (error); } ASSERT(error == 0 || error == EEXIST); - VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0); - (void) rw_unlock(&zs->zs_name_lock); + VERIFY0(dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, zd, &os)); + (void) rw_unlock(&ztest_name_lock); - ztest_zd_init(zd, os); + ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); zilog = zd->zd_zilog; @@ -5037,7 +5606,7 @@ ztest_dataset_open(ztest_shared_t *zs, i ztest_dataset_dirobj_verify(zd); - if (zopt_verbose >= 6) + if (ztest_opts.zo_verbose >= 6) (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", zd->zd_name, (u_longlong_t)zilog->zl_parse_blk_count, @@ -5055,12 +5624,12 @@ ztest_dataset_open(ztest_shared_t *zs, i } static void -ztest_dataset_close(ztest_shared_t *zs, int d) +ztest_dataset_close(int d) { - ztest_ds_t *zd = &zs->zs_zd[d]; + ztest_ds_t *zd = &ztest_ds[d]; zil_close(zd->zd_zilog); - dmu_objset_rele(zd->zd_os, zd); + dmu_objset_disown(zd->zd_os, zd); ztest_zd_fini(zd); } @@ -5073,6 +5642,7 @@ ztest_run(ztest_shared_t *zs) { thread_t *tid; spa_t *spa; + objset_t *os; thread_t resume_tid; int error; @@ -5081,15 +5651,18 @@ ztest_run(ztest_shared_t *zs) /* * Initialize parent/child shared state. */ - VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); - VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); + VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0); zs->zs_thread_start = gethrtime(); - zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC; + zs->zs_thread_stop = + zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); zs->zs_thread_kill = zs->zs_thread_stop; - if (ztest_random(100) < zopt_killrate) - zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC); + if (ztest_random(100) < ztest_opts.zo_killrate) { + zs->zs_thread_kill -= + ztest_random(ztest_opts.zo_passtime * NANOSEC); + } (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL); @@ -5100,8 +5673,19 @@ ztest_run(ztest_shared_t *zs) * Open our pool. 
*/ kernel_init(FREAD | FWRITE); - VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); - zs->zs_spa = spa; + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + spa->spa_debug = B_TRUE; + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + + dmu_objset_stats_t dds; + VERIFY0(dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, FTAG, &os)); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + zs->zs_guid = dds.dds_guid; + dmu_objset_disown(os, FTAG); spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; @@ -5146,21 +5730,23 @@ ztest_run(ztest_shared_t *zs) * If we got any ENOSPC errors on the previous run, destroy something. */ if (zs->zs_enospc_count != 0) { - int d = ztest_random(zopt_datasets); - ztest_dataset_destroy(zs, d); + int d = ztest_random(ztest_opts.zo_datasets); + ztest_dataset_destroy(d); } zs->zs_enospc_count = 0; - tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL); + tid = umem_zalloc(ztest_opts.zo_threads * sizeof (thread_t), + UMEM_NOFAIL); - if (zopt_verbose >= 4) + if (ztest_opts.zo_verbose >= 4) (void) printf("starting main threads...\n"); /* * Kick off all the tests that run in parallel. */ - for (int t = 0; t < zopt_threads; t++) { - if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) + for (int t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && + ztest_dataset_open(t) != 0) return; VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, THR_BOUND, &tid[t]) == 0); @@ -5170,18 +5756,19 @@ ztest_run(ztest_shared_t *zs) * Wait for all of the tests to complete. We go in reverse order * so we don't close datasets while threads are still using them. */ - for (int t = zopt_threads - 1; t >= 0; t--) { + for (int t = ztest_opts.zo_threads - 1; t >= 0; t--) { VERIFY(thr_join(tid[t], NULL, NULL) == 0); - if (t < zopt_datasets) - ztest_dataset_close(zs, t); + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); } txg_wait_synced(spa_get_dsl(spa), 0); zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + zfs_dbgmsg_print(FTAG); - umem_free(tid, zopt_threads * sizeof (thread_t)); + umem_free(tid, ztest_opts.zo_threads * sizeof (thread_t)); /* Kill the resume thread */ ztest_exiting = B_TRUE; @@ -5192,8 +5779,10 @@ ztest_run(ztest_shared_t *zs) * Right before closing the pool, kick off a bunch of async I/O; * spa_close() should wait for it to complete. */ - for (uint64_t object = 1; object < 50; object++) - dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); + for (uint64_t object = 1; object < 50; object++) { + dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, + ZIO_PRIORITY_SYNC_READ); + } spa_close(spa, FTAG); @@ -5202,7 +5791,7 @@ ztest_run(ztest_shared_t *zs) */ mutex_enter(&spa_namespace_lock); for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) - if (zopt_verbose > 3) + if (ztest_opts.zo_verbose > 3) (void) printf("spa_next: found %s\n", spa_name(spa)); mutex_exit(&spa_namespace_lock); @@ -5211,27 +5800,38 @@ ztest_run(ztest_shared_t *zs) * different name. 
*/ if (ztest_random(2) == 0) { - char name[MAXNAMELEN]; - (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool); - ztest_spa_import_export(zs->zs_pool, name); - ztest_spa_import_export(name, zs->zs_pool); + char name[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(name, sizeof (name), "%s_import", + ztest_opts.zo_pool); + ztest_spa_import_export(ztest_opts.zo_pool, name); + ztest_spa_import_export(name, ztest_opts.zo_pool); } kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + + (void) _mutex_destroy(&zcl.zcl_callbacks_lock); + + (void) rwlock_destroy(&ztest_name_lock); + (void) _mutex_destroy(&ztest_vdev_lock); } static void -ztest_freeze(ztest_shared_t *zs) +ztest_freeze(void) { - ztest_ds_t *zd = &zs->zs_zd[0]; + ztest_ds_t *zd = &ztest_ds[0]; spa_t *spa; + int numloops = 0; - if (zopt_verbose >= 3) + if (ztest_opts.zo_verbose >= 3) (void) printf("testing spa_freeze()...\n"); kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); - VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(0)); + spa->spa_debug = B_TRUE; + ztest_spa = spa; /* * Force the first log block to be transactionally allocated. @@ -5240,7 +5840,7 @@ ztest_freeze(ztest_shared_t *zs) */ while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { ztest_dmu_object_alloc_free(zd, 0); - zil_commit(zd->zd_zilog, UINT64_MAX, 0); + zil_commit(zd->zd_zilog, 0); } txg_wait_synced(spa_get_dsl(spa), 0); @@ -5252,28 +5852,43 @@ ztest_freeze(ztest_shared_t *zs) spa_freeze(spa); /* + * Because it is hard to predict how much space a write will actually + * require beforehand, we leave ourselves some fudge space to write over + * capacity. + */ + uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; + + /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. + * + * Run a random number of times less than zo_maxloops and ensure we do + * not run out of space on the pool. */ - while (ztest_random(20) != 0) { - ztest_dmu_write_parallel(zd, 0); - ztest_dmu_object_alloc_free(zd, 0); + while (ztest_random(10) != 0 && + numloops++ < ztest_opts.zo_maxloops && + metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { + ztest_od_t od; + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); + ztest_io(zd, od.od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } /* * Commit all of the changes we just generated. */ - zil_commit(zd->zd_zilog, UINT64_MAX, 0); + zil_commit(zd->zd_zilog, 0); txg_wait_synced(spa_get_dsl(spa), 0); /* * Close our dataset and close the pool. */ - ztest_dataset_close(zs, 0); + ztest_dataset_close(0); spa_close(spa, FTAG); kernel_fini(); @@ -5281,22 +5896,22 @@ ztest_freeze(ztest_shared_t *zs) * Open and close the pool and dataset to induce log replay. 
*/ kernel_init(FREAD | FWRITE); - VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); - VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); - ztest_dataset_close(zs, 0); - spa_close(spa, FTAG); - kernel_fini(); - - list_destroy(&zcl.zcl_callbacks); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + ASSERT(spa_freeze_txg(spa) == UINT64_MAX); + VERIFY3U(0, ==, ztest_dataset_open(0)); + ztest_dataset_close(0); - (void) _mutex_destroy(&zcl.zcl_callbacks_lock); + spa->spa_debug = B_TRUE; + ztest_spa = spa; + txg_wait_synced(spa_get_dsl(spa), 0); + ztest_reguid(NULL, 0); - (void) rwlock_destroy(&zs->zs_name_lock); - (void) _mutex_destroy(&zs->zs_vdev_lock); + spa_close(spa, FTAG); + kernel_fini(); } void -print_time(hrtime_t t, char *timebuf, size_t timelen) +print_time(hrtime_t t, char *timebuf) { hrtime_t s = t / NANOSEC; hrtime_t m = s / 60; @@ -5310,14 +5925,14 @@ print_time(hrtime_t t, char *timebuf, si timebuf[0] = '\0'; if (d) - (void) snprintf(timebuf, timelen, + (void) sprintf(timebuf, "%llud%02lluh%02llum%02llus", d, h, m, s); else if (h) - (void) snprintf(timebuf, timelen, "%lluh%02llum%02llus", h, m, s); + (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); else if (m) - (void) snprintf(timebuf, timelen, "%llum%02llus", m, s); + (void) sprintf(timebuf, "%llum%02llus", m, s); else - (void) snprintf(timebuf, timelen, "%llus", s); + (void) sprintf(timebuf, "%llus", s); } static nvlist_t * @@ -5325,15 +5940,11 @@ make_random_props() { nvlist_t *props; - if (ztest_random(2) == 0) - return (NULL); - VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + if (ztest_random(2) == 0) + return (props); VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); - (void) printf("props:\n"); - dump_nvlist(props, 4); - return (props); } @@ -5347,35 +5958,225 @@ ztest_init(ztest_shared_t *zs) spa_t *spa; nvlist_t *nvroot, *props; - VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); - VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); + VERIFY(_mutex_init(&ztest_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&ztest_name_lock, USYNC_THREAD, NULL) == 0); kernel_init(FREAD | FWRITE); /* * Create the storage pool. 
*/ - (void) spa_destroy(zs->zs_pool); + (void) spa_destroy(ztest_opts.zo_pool); ztest_shared->zs_vdev_next_leaf = 0; zs->zs_splits = 0; - zs->zs_mirrors = zopt_mirrors; - nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - 0, zopt_raidz, zs->zs_mirrors, 1); + zs->zs_mirrors = ztest_opts.zo_mirrors; + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, + 0, ztest_opts.zo_raidz, zs->zs_mirrors, 1); props = make_random_props(); - VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL)); + for (int i = 0; i < SPA_FEATURES; i++) { + char buf[1024]; + (void) snprintf(buf, sizeof (buf), "feature@%s", + spa_feature_table[i].fi_uname); + VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); + } + VERIFY3U(0, ==, spa_create(ztest_opts.zo_pool, nvroot, props, NULL)); nvlist_free(nvroot); + nvlist_free(props); + + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; - VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); - metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); - ztest_run_zdb(zs->zs_pool); + ztest_run_zdb(ztest_opts.zo_pool); + + ztest_freeze(); + + ztest_run_zdb(ztest_opts.zo_pool); + + (void) rwlock_destroy(&ztest_name_lock); + (void) _mutex_destroy(&ztest_vdev_lock); +} + +static void +setup_data_fd(void) +{ + static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; + + ztest_fd_data = mkstemp(ztest_name_data); + ASSERT3S(ztest_fd_data, >=, 0); + (void) unlink(ztest_name_data); +} + + +static int +shared_data_size(ztest_shared_hdr_t *hdr) +{ + int size; + + size = hdr->zh_hdr_size; + size += hdr->zh_opts_size; + size += hdr->zh_size; + size += hdr->zh_stats_size * hdr->zh_stats_count; + size += hdr->zh_ds_size * hdr->zh_ds_count; + + return (size); +} + +static void +setup_hdr(void) +{ + int size; + ztest_shared_hdr_t *hdr; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + + VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); + + hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); + hdr->zh_opts_size = sizeof (ztest_shared_opts_t); + hdr->zh_size = sizeof (ztest_shared_t); + hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); + hdr->zh_stats_count = ZTEST_FUNCS; + hdr->zh_ds_size = sizeof (ztest_shared_ds_t); + hdr->zh_ds_count = ztest_opts.zo_datasets; + + size = shared_data_size(hdr); + VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); +} + +static void +setup_data(void) +{ + int size, offset; + ztest_shared_hdr_t *hdr; + uint8_t *buf; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + + size = shared_data_size(hdr); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); + hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + buf = (uint8_t *)hdr; + + offset = hdr->zh_hdr_size; + ztest_shared_opts = (void *)&buf[offset]; + offset += hdr->zh_opts_size; + ztest_shared = (void *)&buf[offset]; + offset += hdr->zh_size; + ztest_shared_callstate = (void *)&buf[offset]; + offset += hdr->zh_stats_size * hdr->zh_stats_count; + ztest_shared_ds = (void *)&buf[offset]; +} + +static boolean_t +exec_child(char *cmd, 
char *libpath, boolean_t ignorekill, int *statusp) +{ + pid_t pid; + int status; + char *cmdbuf = NULL; + + pid = fork(); + + if (cmd == NULL) { + cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); + cmd = cmdbuf; + } + + if (pid == -1) + fatal(1, "fork failed"); + + if (pid == 0) { /* child */ + char *emptyargv[2] = { cmd, NULL }; + char fd_data_str[12]; + + struct rlimit rl = { 1024, 1024 }; + (void) setrlimit(RLIMIT_NOFILE, &rl); + + (void) close(ztest_fd_rand); + VERIFY3U(11, >=, + snprintf(fd_data_str, 12, "%d", ztest_fd_data)); + VERIFY0(setenv("ZTEST_FD_DATA", fd_data_str, 1)); + + (void) enable_extended_FILE_stdio(-1, -1); + if (libpath != NULL) + VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1)); +#ifdef illumos + (void) execv(cmd, emptyargv); +#else + (void) execvp(cmd, emptyargv); +#endif + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "exec failed: %s", cmd); + } + + if (cmdbuf != NULL) { + umem_free(cmdbuf, MAXPATHLEN); + cmd = NULL; + } + + while (waitpid(pid, &status, 0) != pid) + continue; + if (statusp != NULL) + *statusp = status; + + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) != 0) { + (void) fprintf(stderr, "child exited with code %d\n", + WEXITSTATUS(status)); + exit(2); + } + return (B_FALSE); + } else if (WIFSIGNALED(status)) { + if (!ignorekill || WTERMSIG(status) != SIGKILL) { + (void) fprintf(stderr, "child died with signal %d\n", + WTERMSIG(status)); + exit(3); + } + return (B_TRUE); + } else { + (void) fprintf(stderr, "something strange happened to child\n"); + exit(4); + /* NOTREACHED */ + } +} + +static void +ztest_run_init(void) +{ + ztest_shared_t *zs = ztest_shared; + + ASSERT(ztest_opts.zo_init != 0); - ztest_freeze(zs); + /* + * Blow away any existing copy of zpool.cache + */ + (void) remove(spa_config_path); - ztest_run_zdb(zs->zs_pool); + /* + * Create and initialize our storage pool. 
+ */ + for (int i = 1; i <= ztest_opts.zo_init; i++) { + bzero(zs, sizeof (ztest_shared_t)); + if (ztest_opts.zo_verbose >= 3 && + ztest_opts.zo_init != 1) { + (void) printf("ztest_init(), pass %d\n", i); + } + ztest_init(zs); + } } int @@ -5383,63 +6184,98 @@ main(int argc, char **argv) { int kills = 0; int iters = 0; + int older = 0; + int newer = 0; ztest_shared_t *zs; - size_t shared_size; ztest_info_t *zi; + ztest_shared_callstate_t *zc; char timebuf[100]; char numbuf[6]; spa_t *spa; + char *cmd; + boolean_t hasalt; + char *fd_data_str = getenv("ZTEST_FD_DATA"); (void) setvbuf(stdout, NULL, _IOLBF, 0); - /* Override location of zpool.cache */ - spa_config_path = "/tmp/zpool.cache"; + dprintf_setup(&argc, argv); + zfs_deadman_synctime_ms = 300000; - ztest_random_fd = open("/dev/urandom", O_RDONLY); + ztest_fd_rand = open("/dev/urandom", O_RDONLY); + ASSERT3S(ztest_fd_rand, >=, 0); - process_options(argc, argv); + if (!fd_data_str) { + process_options(argc, argv); - /* - * Blow away any existing copy of zpool.cache - */ - if (zopt_init != 0) - (void) remove("/tmp/zpool.cache"); + setup_data_fd(); + setup_hdr(); + setup_data(); + bcopy(&ztest_opts, ztest_shared_opts, + sizeof (*ztest_shared_opts)); + } else { + ztest_fd_data = atoi(fd_data_str); + setup_data(); + bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); + } + ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); - shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t); + /* Override location of zpool.cache */ + VERIFY3U(asprintf((char **)&spa_config_path, "%s/zpool.cache", + ztest_opts.zo_dir), !=, -1); - zs = ztest_shared = (void *)mmap(0, - P2ROUNDUP(shared_size, getpagesize()), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); + ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), + UMEM_NOFAIL); + zs = ztest_shared; + + if (fd_data_str) { + metaslab_gang_bang = ztest_opts.zo_metaslab_gang_bang; + metaslab_df_alloc_threshold = + zs->zs_metaslab_df_alloc_threshold; - if (zopt_verbose >= 1) { - (void) printf("%llu vdevs, %d datasets, %d threads," - " %llu seconds...\n", - (u_longlong_t)zopt_vdevs, zopt_datasets, zopt_threads, - (u_longlong_t)zopt_time); + if (zs->zs_do_init) + ztest_run_init(); + else + ztest_run(zs); + exit(0); } - /* - * Create and initialize our storage pool. 
- */ - for (int i = 1; i <= zopt_init; i++) { - bzero(zs, sizeof (ztest_shared_t)); - if (zopt_verbose >= 3 && zopt_init != 1) - (void) printf("ztest_init(), pass %d\n", i); - zs->zs_pool = zopt_pool; - ztest_init(zs); + hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("%llu vdevs, %d datasets, %d threads," + " %llu seconds...\n", + (u_longlong_t)ztest_opts.zo_vdevs, + ztest_opts.zo_datasets, + ztest_opts.zo_threads, + (u_longlong_t)ztest_opts.zo_time); + } + + cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); + (void) strlcpy(cmd, getexecname(), MAXNAMELEN); + + zs->zs_do_init = B_TRUE; + if (strlen(ztest_opts.zo_alt_ztest) != 0) { + if (ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest for " + "initialization: %s\n", ztest_opts.zo_alt_ztest); + } + VERIFY(!exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_FALSE, NULL)); + } else { + VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); } + zs->zs_do_init = B_FALSE; - zs->zs_pool = zopt_pool; zs->zs_proc_start = gethrtime(); - zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC; + zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; for (int f = 0; f < ZTEST_FUNCS; f++) { - zi = &zs->zs_info[f]; - *zi = ztest_info[f]; + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) - zi->zi_call_next = UINT64_MAX; + zc->zc_next = UINT64_MAX; else - zi->zi_call_next = zs->zs_proc_start + + zc->zc_next = zs->zs_proc_start + ztest_random(2 * zi->zi_interval[0] + 1); } @@ -5450,65 +6286,48 @@ main(int argc, char **argv) */ while (gethrtime() < zs->zs_proc_stop) { int status; - pid_t pid; + boolean_t killed; /* * Initialize the workload counters for each function. 
*/ for (int f = 0; f < ZTEST_FUNCS; f++) { - zi = &zs->zs_info[f]; - zi->zi_call_count = 0; - zi->zi_call_time = 0; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + zc->zc_count = 0; + zc->zc_time = 0; } /* Set the allocation switch size */ - metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1; - - pid = fork(); - - if (pid == -1) - fatal(1, "fork failed"); + zs->zs_metaslab_df_alloc_threshold = + ztest_random(zs->zs_metaslab_sz / 4) + 1; - if (pid == 0) { /* child */ - struct rlimit rl = { 1024, 1024 }; - (void) setrlimit(RLIMIT_NOFILE, &rl); - (void) enable_extended_FILE_stdio(-1, -1); - ztest_run(zs); - exit(0); - } - - while (waitpid(pid, &status, 0) != pid) - continue; - - if (WIFEXITED(status)) { - if (WEXITSTATUS(status) != 0) { - (void) fprintf(stderr, - "child exited with code %d\n", - WEXITSTATUS(status)); - exit(2); - } - } else if (WIFSIGNALED(status)) { - if (WTERMSIG(status) != SIGKILL) { - (void) fprintf(stderr, - "child died with signal %d\n", - WTERMSIG(status)); - exit(3); + if (!hasalt || ztest_random(2) == 0) { + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing newer ztest: %s\n", + cmd); } - kills++; + newer++; + killed = exec_child(cmd, NULL, B_TRUE, &status); } else { - (void) fprintf(stderr, "something strange happened " - "to child\n"); - exit(4); + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest: %s\n", + ztest_opts.zo_alt_ztest); + } + older++; + killed = exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_TRUE, &status); } + if (killed) + kills++; iters++; - if (zopt_verbose >= 1) { + if (ztest_opts.zo_verbose >= 1) { hrtime_t now = gethrtime(); now = MIN(now, zs->zs_proc_stop); - print_time(zs->zs_proc_stop - now, timebuf, sizeof(timebuf)); - nicenum(zs->zs_space, numbuf, sizeof(numbuf)); + print_time(zs->zs_proc_stop - now, timebuf); + nicenum(zs->zs_space, numbuf); (void) printf("Pass %3d, %8s, %3llu ENOSPC, " "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", @@ -5518,10 +6337,10 @@ main(int argc, char **argv) 100.0 * zs->zs_alloc / zs->zs_space, numbuf, 100.0 * (now - zs->zs_proc_start) / - (zopt_time * NANOSEC), timebuf); + (ztest_opts.zo_time * NANOSEC), timebuf); } - if (zopt_verbose >= 2) { + if (ztest_opts.zo_verbose >= 2) { (void) printf("\nWorkload summary:\n\n"); (void) printf("%7s %9s %s\n", "Calls", "Time", "Function"); @@ -5530,11 +6349,12 @@ main(int argc, char **argv) for (int f = 0; f < ZTEST_FUNCS; f++) { Dl_info dli; - zi = &zs->zs_info[f]; - print_time(zi->zi_call_time, timebuf, sizeof(timebuf)); + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + print_time(zc->zc_time, timebuf); (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%7llu %9s %s\n", - (u_longlong_t)zi->zi_call_count, timebuf, + (u_longlong_t)zc->zc_count, timebuf, dli.dli_sname); } (void) printf("\n"); @@ -5546,25 +6366,33 @@ main(int argc, char **argv) * instead of 'ztest'. Do a blind rename in case this happened. 
*/ kernel_init(FREAD); - if (spa_open(zopt_pool, &spa, FTAG) == 0) { + if (spa_open(ztest_opts.zo_pool, &spa, FTAG) == 0) { spa_close(spa, FTAG); } else { - char tmpname[MAXNAMELEN]; + char tmpname[ZFS_MAX_DATASET_NAME_LEN]; kernel_fini(); kernel_init(FREAD | FWRITE); (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp", - zopt_pool); - (void) spa_rename(tmpname, zopt_pool); + ztest_opts.zo_pool); + (void) spa_rename(tmpname, ztest_opts.zo_pool); } kernel_fini(); - ztest_run_zdb(zopt_pool); + ztest_run_zdb(ztest_opts.zo_pool); } - if (zopt_verbose >= 1) { + if (ztest_opts.zo_verbose >= 1) { + if (hasalt) { + (void) printf("%d runs of older ztest: %s\n", older, + ztest_opts.zo_alt_ztest); + (void) printf("%d runs of newer ztest: %s\n", newer, + cmd); + } (void) printf("%d killed, %d completed, %.0f%% kill rate\n", kills, iters - kills, (100.0 * kills) / MAX(1, iters)); } + umem_free(cmd, MAXNAMELEN); + return (0); } Index: src/external/cddl/osnet/dist/common/acl/acl_common.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/acl/acl_common.c,v retrieving revision 1.5 diff -u -p -r1.5 acl_common.c --- src/external/cddl/osnet/dist/common/acl/acl_common.c 20 Nov 2011 02:54:25 -0000 1.5 +++ src/external/cddl/osnet/dist/common/acl/acl_common.c 14 Jul 2016 01:26:26 -0000 @@ -19,21 +19,20 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include -#include #include #include +#include #if defined(_KERNEL) +#include #include #include #include -#include +#include #else #include #include @@ -151,166 +150,6 @@ typedef struct ace_list { int seen; /* bitmask of all aclent_t a_type values seen */ } ace_list_t; -ace_t trivial_acl[] = { - {(uid_t)-1, 0, ACE_OWNER, ACE_ACCESS_DENIED_ACE_TYPE}, - {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| - ACE_WRITE_NAMED_ATTRS, ACE_OWNER, ACE_ACCESS_ALLOWED_ACE_TYPE}, - {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, - ACE_ACCESS_DENIED_ACE_TYPE}, - {(uid_t)-1, 0, ACE_GROUP|ACE_IDENTIFIER_GROUP, - ACE_ACCESS_ALLOWED_ACE_TYPE}, - {(uid_t)-1, ACE_WRITE_ACL|ACE_WRITE_OWNER| ACE_WRITE_ATTRIBUTES| - ACE_WRITE_NAMED_ATTRS, ACE_EVERYONE, ACE_ACCESS_DENIED_ACE_TYPE}, - {(uid_t)-1, ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| - ACE_SYNCHRONIZE, ACE_EVERYONE, ACE_ACCESS_ALLOWED_ACE_TYPE} -}; - - -void -adjust_ace_pair_common(void *pair, size_t access_off, - size_t pairsize, mode_t mode) -{ - char *datap = (char *)pair; - uint32_t *amask0 = (uint32_t *)(uintptr_t)(datap + access_off); - uint32_t *amask1 = (uint32_t *)(uintptr_t)(datap + pairsize + - access_off); - if (mode & S_IROTH) - *amask1 |= ACE_READ_DATA; - else - *amask0 |= ACE_READ_DATA; - if (mode & S_IWOTH) - *amask1 |= ACE_WRITE_DATA|ACE_APPEND_DATA; - else - *amask0 |= ACE_WRITE_DATA|ACE_APPEND_DATA; - if (mode & S_IXOTH) - *amask1 |= ACE_EXECUTE; - else - *amask0 |= ACE_EXECUTE; -} - -void -adjust_ace_pair(ace_t *pair, mode_t mode) -{ - adjust_ace_pair_common(pair, offsetof(ace_t, a_access_mask), - sizeof (ace_t), mode); -} - -static void -ace_allow_deny_helper(uint16_t type, boolean_t *allow, boolean_t *deny) -{ - if (type == ACE_ACCESS_ALLOWED_ACE_TYPE) - *allow = B_TRUE; - else if (type == ACE_ACCESS_DENIED_ACE_TYPE) - *deny = B_TRUE; -} - -/* - * 
ace_trivial: - * determine whether an ace_t acl is trivial - * - * Trivialness implies that the acl is composed of only - * owner, group, everyone entries. ACL can't - * have read_acl denied, and write_owner/write_acl/write_attributes - * can only be owner@ entry. - */ -int -ace_trivial_common(void *acep, int aclcnt, - uint64_t (*walk)(void *, uint64_t, int aclcnt, - uint16_t *, uint16_t *, uint32_t *)) -{ - boolean_t owner_allow = B_FALSE; - boolean_t group_allow = B_FALSE; - boolean_t everyone_allow = B_FALSE; - boolean_t owner_deny = B_FALSE; - boolean_t group_deny = B_FALSE; - boolean_t everyone_deny = B_FALSE; - uint16_t flags; - uint32_t mask; - uint16_t type; - uint64_t cookie = 0; - - while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) { - switch (flags & ACE_TYPE_FLAGS) { - case ACE_OWNER: - if (group_allow || group_deny || everyone_allow || - everyone_deny) - return (1); - ace_allow_deny_helper(type, &owner_allow, &owner_deny); - break; - case ACE_GROUP|ACE_IDENTIFIER_GROUP: - if (everyone_allow || everyone_deny && - (!owner_allow && !owner_deny)) - return (1); - ace_allow_deny_helper(type, &group_allow, &group_deny); - break; - - case ACE_EVERYONE: - if (!owner_allow && !owner_deny && - !group_allow && !group_deny) - return (1); - ace_allow_deny_helper(type, - &everyone_allow, &everyone_deny); - break; - default: - return (1); - - } - - if (flags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| - ACE_INHERIT_ONLY_ACE)) - return (1); - - /* - * Special check for some special bits - * - * Don't allow anybody to deny reading basic - * attributes or a files ACL. - */ - if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && - (type == ACE_ACCESS_DENIED_ACE_TYPE)) - return (1); - - /* - * Allow on owner@ to allow - * write_acl/write_owner/write_attributes - */ - if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && - (!(flags & ACE_OWNER) && (mask & - (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES)))) - return (1); - - } - - if (!owner_allow || !owner_deny || !group_allow || !group_deny || - !everyone_allow || !everyone_deny) - return (1); - - return (0); -} - -uint64_t -ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, - uint16_t *type, uint32_t *mask) -{ - ace_t *acep = datap; - - if (cookie >= aclcnt) - return (0); - - *flags = acep[cookie].a_flags; - *type = acep[cookie].a_type; - *mask = acep[cookie++].a_access_mask; - - return (cookie); -} - -int -ace_trivial(ace_t *acep, int aclcnt) -{ - return (ace_trivial_common(acep, aclcnt, ace_walk)); -} - /* * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. * v = Ptr to array/vector of objs @@ -469,7 +308,6 @@ acl_free(acl_t *aclp) cacl_free(aclp, sizeof (acl_t)); } -#endif static uint32_t access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) @@ -539,7 +377,7 @@ access_mask_set(int haswriteperm, int ha * by nfsace, assuming aclent_t -> nfsace semantics. 
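For illustration of the "trivial" ACL shape described above, the sketch below (not part of the patch) builds the owner/group/everyone entries for a plain file with mode 0644 using acl_trivial_create(), which this change adds further down in this file; the helper name show_trivial_acl and the printf output are purely illustrative.

/*
 * Illustrative userland sketch only, not part of the patch: dump the
 * trivial ACL that acl_trivial_create() builds for mode 0644 on a
 * non-directory. Deny/allow pairs for owner@ and group@ appear only
 * when the mode requires them; allow entries for owner@, group@ and
 * everyone@ always follow.
 */
static void
show_trivial_acl(void)
{
	ace_t *acl;
	int count;

	if (acl_trivial_create(0644, B_FALSE, &acl, &count) != 0)
		return;

	for (int i = 0; i < count; i++) {
		(void) printf("who=%d type=%d flags=0x%x mask=0x%x\n",
		    (int)acl[i].a_who, acl[i].a_type, acl[i].a_flags,
		    acl[i].a_access_mask);
	}

	/* acl_trivial_create() allocates with cacl_malloc(). */
	cacl_free(acl, count * sizeof (ace_t));
}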
*/ static uint32_t -mode_to_ace_access(mode_t mode, int isdir, int isowner, int isallow) +mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) { uint32_t access = 0; int haswriteperm = 0; @@ -582,7 +420,7 @@ mode_to_ace_access(mode_t mode, int isdi access |= ACE_DELETE_CHILD; } /* exec */ - if (mode & 01) { + if (mode & S_IXOTH) { access |= ACE_EXECUTE; } @@ -833,7 +671,7 @@ out: } static int -convert_aent_to_ace(aclent_t *aclentp, int aclcnt, int isdir, +convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, ace_t **retacep, int *retacecnt) { ace_t *acep; @@ -859,7 +697,7 @@ convert_aent_to_ace(aclent_t *aclentp, i dfaclcnt = aclcnt - i; } - if (dfaclcnt && isdir == 0) { + if (dfaclcnt && !isdir) { return (EINVAL); } @@ -897,7 +735,7 @@ convert_aent_to_ace(aclent_t *aclentp, i } static int -ace_mask_to_mode(uint32_t mask, o_mode_t *modep, int isdir) +ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) { int error = 0; o_mode_t mode = 0; @@ -1194,7 +1032,7 @@ out: } static int -ace_allow_to_mode(uint32_t mask, o_mode_t *modep, int isdir) +ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) { /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != @@ -1207,7 +1045,7 @@ ace_allow_to_mode(uint32_t mask, o_mode_ static int acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, - uid_t owner, gid_t group, int isdir) + uid_t owner, gid_t group, boolean_t isdir) { int error; uint32_t flips = ACE_POSIX_SUPPORTED_BITS; @@ -1247,7 +1085,7 @@ out: static int ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, - uid_t owner, gid_t group, int isdir) + uid_t owner, gid_t group, boolean_t isdir) { int error = 0; aclent_t *aent, *result = NULL; @@ -1427,7 +1265,7 @@ acevals_compare(const void *va, const vo static int ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, - int isdir) + boolean_t isdir) { int error = 0; ace_t *acep; @@ -1622,7 +1460,7 @@ out: } static int -convert_ace_to_aent(ace_t *acebufp, int acecnt, int isdir, +convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) { int error = 0; @@ -1664,7 +1502,7 @@ convert_ace_to_aent(ace_t *acebufp, int int -acl_translate(acl_t *aclp, int target_flavor, int isdir, uid_t owner, +acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, gid_t group) { int aclcnt; @@ -1726,3 +1564,202 @@ out: return (error); #endif } +#endif /* !_KERNEL */ + +#define SET_ACE(acl, index, who, mask, type, flags) { \ + acl[0][index].a_who = (uint32_t)who; \ + acl[0][index].a_type = type; \ + acl[0][index].a_flags = flags; \ + acl[0][index++].a_access_mask = mask; \ +} + +void +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) +{ + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + (void) isdir; /* will need this later */ + + masks->deny1 = 0; + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + masks->deny1 |= read_mask; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + masks->deny1 |= write_mask; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + masks->deny1 |= execute_mask; + + masks->deny2 = 0; + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + masks->deny2 |= read_mask; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + masks->deny2 |= 
write_mask; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + masks->deny2 |= execute_mask; + + masks->allow0 = 0; + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + masks->allow0 |= read_mask; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + masks->allow0 |= write_mask; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + masks->allow0 |= execute_mask; + + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + masks->owner |= read_mask; + if (mode & S_IWUSR) + masks->owner |= write_mask; + if (mode & S_IXUSR) + masks->owner |= execute_mask; + + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + masks->group |= read_mask; + if (mode & S_IWGRP) + masks->group |= write_mask; + if (mode & S_IXGRP) + masks->group |= execute_mask; + + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + masks->everyone |= read_mask; + if (mode & S_IWOTH) + masks->everyone |= write_mask; + if (mode & S_IXOTH) + masks->everyone |= execute_mask; +} + +int +acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) +{ + int index = 0; + int error; + trivial_acl_t masks; + + *count = 3; + acl_trivial_access_masks(mode, isdir, &masks); + + if (masks.allow0) + (*count)++; + if (masks.deny1) + (*count)++; + if (masks.deny2) + (*count)++; + + if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) + return (error); + + if (masks.allow0) { + SET_ACE(acl, index, -1, masks.allow0, + ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny1) { + SET_ACE(acl, index, -1, masks.deny1, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny2) { + SET_ACE(acl, index, -1, masks.deny2, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); + } + + SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_OWNER); + SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_IDENTIFIER_GROUP|ACE_GROUP); + SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_EVERYONE); + + return (0); +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. + */ +int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while (cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask)) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. 
+ */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permissions are never set by default + */ + if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + return (0); +} + +uint64_t +ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, + uint16_t *type, uint32_t *mask) +{ + ace_t *acep = datap; + + if (cookie >= aclcnt) + return (0); + + *flags = acep[cookie].a_flags; + *type = acep[cookie].a_type; + *mask = acep[cookie++].a_access_mask; + + return (cookie); +} + +int +ace_trivial(ace_t *acep, int aclcnt) +{ + return (ace_trivial_common(acep, aclcnt, ace_walk)); +} Index: src/external/cddl/osnet/dist/common/acl/acl_common.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/acl/acl_common.h,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 acl_common.h --- src/external/cddl/osnet/dist/common/acl/acl_common.h 7 Aug 2009 18:32:26 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/common/acl/acl_common.h 12 Jun 2012 05:57:26 -0000 @@ -19,16 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ #ifndef _ACL_COMMON_H #define _ACL_COMMON_H -#pragma ident "%Z%%M% %I% %E% SMI" - - #include #include #include @@ -37,7 +34,14 @@ extern "C" { #endif -extern ace_t trivial_acl[6]; +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; extern int acltrivial(const char *); extern void adjust_ace_pair(ace_t *pair, mode_t mode); @@ -46,13 +50,17 @@ extern int ace_trivial(ace_t *acep, int extern int ace_trivial_common(void *, int, uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, uint32_t *mask)); +#if !defined(_KERNEL) extern acl_t *acl_alloc(acl_type_t); extern void acl_free(acl_t *aclp); -extern int acl_translate(acl_t *aclp, int target_flavor, - int isdir, uid_t owner, gid_t group); +extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, + uid_t owner, gid_t group); +#endif /* !_KERNEL */ void ksort(caddr_t v, int n, int s, int (*f)()); int cmp2acls(void *a, void *b); - +int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count); +void acl_trivial_access_masks(mode_t mode, boolean_t isdir, + trivial_acl_t *masks); #ifdef __cplusplus } Index: src/external/cddl/osnet/dist/common/avl/avl.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/avl/avl.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 avl.c --- src/external/cddl/osnet/dist/common/avl/avl.c 27 Feb 2010 22:29:42 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/avl/avl.c 20 Sep 2015 01:09:06 -0000 @@ -24,6 +24,11 @@ */ /* + * 
Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + */ + +/* * AVL - generic AVL tree implementation for kernel use * * A complete description of AVL trees can be found in many CS textbooks. @@ -37,7 +42,7 @@ * insertion and deletion relatively efficiently. Searching the tree is * still a fast operation, roughly O(log(N)). * - * The key to insertion and deletion is a set of tree maniuplations called + * The key to insertion and deletion is a set of tree manipulations called * rotations, which bring unbalanced subtrees back into the semi-balanced state. * * This implementation of AVL trees has the following peculiarities: @@ -45,7 +50,7 @@ * - The AVL specific data structures are physically embedded as fields * in the "using" data structures. To maintain generality the code * must constantly translate between "avl_node_t *" and containing - * data structure "void *"s by adding/subracting the avl_offset. + * data structure "void *"s by adding/subtracting the avl_offset. * * - Since the AVL data is always embedded in other structures, there is * no locking or memory allocation in the AVL routines. This must be @@ -85,16 +90,22 @@ * is a modified "avl_node_t *". The bottom bit (normally 0 for a * pointer) is set to indicate if that the new node has a value greater * than the value of the indicated "avl_node_t *". + * + * Note - in addition to userland (e.g. libavl and libutil) and the kernel + * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, + * which each have their own compilation environments and subsequent + * requirements. Each of these environments must be considered when adding + * dependencies from avl.c. */ #include #include +#include #include #include -#include /* - * Small arrays to translate between balance (or diff) values and child indeces. + * Small arrays to translate between balance (or diff) values and child indices. * * Code that deals with binary tree data structures will randomly use * left and right children when examining a tree. C "if()" statements @@ -114,7 +125,8 @@ static const int avl_balance2child[] = * * - If there is a left child, go to it, then to it's rightmost descendant. * - * - otherwise we return thru parent nodes until we've come from a right child. + * - otherwise we return through parent nodes until we've come from a right + * child. * * Return Value: * NULL - if at the end of the nodes @@ -624,14 +636,17 @@ avl_add(avl_tree_t *tree, void *new_node /* * This is unfortunate. We want to call panic() here, even for * non-DEBUG kernels. In userland, however, we can't depend on anything - * in libc or else the rtld build process gets confused. So, all we can - * do in userland is resort to a normal ASSERT(). + * in libc or else the rtld build process gets confused. + * Thankfully, rtld provides us with its own assfail() so we can use + * that here. We use assfail() directly to get a nice error message + * in the core - much like what panic() does for crashdumps. 
*/ if (avl_find(tree, new_node, &where) != NULL) #ifdef _KERNEL panic("avl_find() succeeded inside avl_add()"); #else - ASSERT(0); + (void) assfail("avl_find() succeeded inside avl_add()", + __FILE__, __LINE__); #endif avl_insert(tree, new_node, where); } @@ -863,6 +878,24 @@ avl_update(avl_tree_t *t, void *obj) return (B_FALSE); } +void +avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) +{ + avl_node_t *temp_node; + ulong_t temp_numnodes; + + ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); + ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); + ASSERT3U(tree1->avl_size, ==, tree2->avl_size); + + temp_node = tree1->avl_root; + temp_numnodes = tree1->avl_numnodes; + tree1->avl_root = tree2->avl_root; + tree1->avl_numnodes = tree2->avl_numnodes; + tree2->avl_root = temp_node; + tree2->avl_numnodes = temp_numnodes; +} + /* * initialize a new AVL tree */ @@ -919,7 +952,7 @@ avl_is_empty(avl_tree_t *tree) /* * Post-order tree walk used to visit all tree nodes and destroy the tree - * in post order. This is used for destroying a tree w/o paying any cost + * in post order. This is used for destroying a tree without paying any cost * for rebalancing it. * * example: Index: src/external/cddl/osnet/dist/common/ctf/ctf_types.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/ctf/ctf_types.c,v retrieving revision 1.4 diff -u -p -r1.4 ctf_types.c --- src/external/cddl/osnet/dist/common/ctf/ctf_types.c 27 Dec 2015 21:39:01 -0000 1.4 +++ src/external/cddl/osnet/dist/common/ctf/ctf_types.c 11 Apr 2017 09:42:30 -0000 @@ -647,11 +647,8 @@ ctf_type_compat(ctf_file_t *lfp, ctf_id_ } } -/* - * Return the type and offset for a given member of a STRUCT or UNION. - */ -int -ctf_member_info(ctf_file_t *fp, ctf_id_t type, const char *name, +static int +_ctf_member_info(ctf_file_t *fp, ctf_id_t type, const char *name, ulong_t off, ctf_membinfo_t *mip) { ctf_file_t *ofp = fp; @@ -676,9 +673,13 @@ ctf_member_info(ctf_file_t *fp, ctf_id_t ((uintptr_t)tp + increment); for (n = LCTF_INFO_VLEN(fp, tp->ctt_info); n != 0; n--, mp++) { + if (mp->ctm_name == 0 && + _ctf_member_info(fp, mp->ctm_type, name, + mp->ctm_offset + off, mip) == 0) + return (0); if (strcmp(ctf_strptr(fp, mp->ctm_name), name) == 0) { mip->ctm_type = mp->ctm_type; - mip->ctm_offset = mp->ctm_offset; + mip->ctm_offset = mp->ctm_offset + off; return (0); } } @@ -687,9 +688,14 @@ ctf_member_info(ctf_file_t *fp, ctf_id_t ((uintptr_t)tp + increment); for (n = LCTF_INFO_VLEN(fp, tp->ctt_info); n != 0; n--, lmp++) { + if (lmp->ctlm_name == 0 && + _ctf_member_info(fp, lmp->ctlm_name, name, + (ulong_t)CTF_LMEM_OFFSET(lmp) + off, mip) == 0) + return (0); if (strcmp(ctf_strptr(fp, lmp->ctlm_name), name) == 0) { mip->ctm_type = lmp->ctlm_type; - mip->ctm_offset = (ulong_t)CTF_LMEM_OFFSET(lmp); + mip->ctm_offset = + (ulong_t)CTF_LMEM_OFFSET(lmp) + off; return (0); } } @@ -699,6 +705,17 @@ ctf_member_info(ctf_file_t *fp, ctf_id_t } /* + * Return the type and offset for a given member of a STRUCT or UNION. + */ +int +ctf_member_info(ctf_file_t *fp, ctf_id_t type, const char *name, + ctf_membinfo_t *mip) +{ + + return (_ctf_member_info(fp, type, name, 0, mip)); +} + +/* * Return the array type, index, and size information for the specified ARRAY. 
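The _ctf_member_info() change above lets member lookups descend into unnamed (anonymous) struct and union members, adding each intermediate member's offset along the way. A small sketch of the situation it handles follows; the type and member names are made up for illustration and nothing here is part of the patch.

/*
 * Illustration only: with the recursion added above, a lookup of
 * "inner_val" on struct outer now succeeds, and the reported offset is
 * relative to struct outer because the anonymous union's own offset is
 * added in (mp->ctm_offset + off).
 */
struct outer {
	int	first;
	union {				/* anonymous member, ctm_name == 0 */
		int	inner_val;
		char	inner_buf[8];
	};
	int	last;
};

/*
 * ctf_membinfo_t mi;
 * if (ctf_member_info(fp, outer_id, "inner_val", &mi) == 0)
 *	-> mi.ctm_type / mi.ctm_offset describe inner_val within outer.
 * Here fp and outer_id stand for a CTF container and the type id of
 * struct outer obtained elsewhere.
 */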
*/ int Index: src/external/cddl/osnet/dist/common/nvpair/fnvpair.c =================================================================== RCS file: src/external/cddl/osnet/dist/common/nvpair/fnvpair.c diff -N src/external/cddl/osnet/dist/common/nvpair/fnvpair.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/common/nvpair/fnvpair.c 16 Jun 2017 17:49:37 -0000 @@ -0,0 +1,512 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#ifndef _KERNEL +#include +#include +#else +#include +#include +#include +#include +#endif + +/* + * "Force" nvlist wrapper. + * + * These functions wrap the nvlist_* functions with assertions that assume + * the operation is successful. This allows the caller's code to be much + * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* + * functions, which can return the requested value (rather than filling in + * a pointer). + * + * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate + * with KM_SLEEP. + * + * More wrappers should be added as needed -- for example + * nvlist_lookup_*_array and nvpair_value_*_array. + */ + +nvlist_t * +fnvlist_alloc(void) +{ + nvlist_t *nvl; + VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)); + return (nvl); +} + +void +fnvlist_free(nvlist_t *nvl) +{ + nvlist_free(nvl); +} + +size_t +fnvlist_size(nvlist_t *nvl) +{ + size_t size; + VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE)); + return (size); +} + +/* + * Returns allocated buffer of size *sizep. Caller must free the buffer with + * fnvlist_pack_free(). 
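Because each wrapper VERIFYs success internally, callers of the fnvlist_* interfaces defined below need no per-call error handling. A minimal usage sketch, not part of the patch, with illustrative names:

/*
 * Illustration only: build an nvlist, read a value back, and free it
 * using the "force" wrappers from this file. Every call asserts
 * success, so there are no return codes to check.
 */
static uint64_t
fnvlist_example(void)
{
	nvlist_t *nvl = fnvlist_alloc();
	uint64_t val;

	fnvlist_add_uint64(nvl, "guid", 12345ULL);
	fnvlist_add_string(nvl, "comment", "example");

	val = fnvlist_lookup_uint64(nvl, "guid");

	fnvlist_free(nvl);
	return (val);
}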
+ */ +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + char *packed = 0; + VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, + KM_SLEEP), ==, 0); + return (packed); +} + +/*ARGSUSED*/ +void +fnvlist_pack_free(char *pack, size_t size) +{ +#ifdef _KERNEL + kmem_free(pack, size); +#else + free(pack); +#endif +} + +nvlist_t * +fnvlist_unpack(char *buf, size_t buflen) +{ + nvlist_t *rv; + VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP)); + return (rv); +} + +nvlist_t * +fnvlist_dup(nvlist_t *nvl) +{ + nvlist_t *rv; + VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP)); + return (rv); +} + +void +fnvlist_merge(nvlist_t *dst, nvlist_t *src) +{ + VERIFY0(nvlist_merge(dst, src, KM_SLEEP)); +} + +size_t +fnvlist_num_pairs(nvlist_t *nvl) +{ + size_t count = 0; + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) + count++; + return (count); +} + +void +fnvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + VERIFY0(nvlist_add_boolean(nvl, name)); +} + +void +fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + VERIFY0(nvlist_add_boolean_value(nvl, name, val)); +} + +void +fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + VERIFY0(nvlist_add_byte(nvl, name, val)); +} + +void +fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) +{ + VERIFY0(nvlist_add_int8(nvl, name, val)); +} + +void +fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) +{ + VERIFY0(nvlist_add_uint8(nvl, name, val)); +} + +void +fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) +{ + VERIFY0(nvlist_add_int16(nvl, name, val)); +} + +void +fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) +{ + VERIFY0(nvlist_add_uint16(nvl, name, val)); +} + +void +fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) +{ + VERIFY0(nvlist_add_int32(nvl, name, val)); +} + +void +fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) +{ + VERIFY0(nvlist_add_uint32(nvl, name, val)); +} + +void +fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) +{ + VERIFY0(nvlist_add_int64(nvl, name, val)); +} + +void +fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) +{ + VERIFY0(nvlist_add_uint64(nvl, name, val)); +} + +void +fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val) +{ + VERIFY0(nvlist_add_string(nvl, name, val)); +} + +void +fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ + VERIFY0(nvlist_add_nvlist(nvl, name, val)); +} + +void +fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY0(nvlist_add_nvpair(nvl, pair)); +} + +void +fnvlist_add_boolean_array(nvlist_t *nvl, const char *name, + boolean_t *val, uint_t n) +{ + VERIFY0(nvlist_add_boolean_array(nvl, name, val, n)); +} + +void +fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n) +{ + VERIFY0(nvlist_add_byte_array(nvl, name, val, n)); +} + +void +fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int8_array(nvl, name, val, n)); +} + +void +fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint8_array(nvl, name, val, n)); +} + +void +fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int16_array(nvl, name, val, n)); +} + +void +fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, + uint16_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint16_array(nvl, name, val, 
n)); +} + +void +fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int32_array(nvl, name, val, n)); +} + +void +fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, + uint32_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint32_array(nvl, name, val, n)); +} + +void +fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int64_array(nvl, name, val, n)); +} + +void +fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, + uint64_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint64_array(nvl, name, val, n)); +} + +void +fnvlist_add_string_array(nvlist_t *nvl, const char *name, + char * const *val, uint_t n) +{ + VERIFY0(nvlist_add_string_array(nvl, name, val, n)); +} + +void +fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t **val, uint_t n) +{ + VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n)); +} + +void +fnvlist_remove(nvlist_t *nvl, const char *name) +{ + VERIFY0(nvlist_remove_all(nvl, name)); +} + +void +fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY0(nvlist_remove_nvpair(nvl, pair)); +} + +nvpair_t * +fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) +{ + nvpair_t *rv; + VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv)); + return (rv); +} + +/* returns B_TRUE if the entry exists */ +boolean_t +fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_boolean(nvl, name) == 0); +} + +boolean_t +fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name) +{ + boolean_t rv; + VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv)); + return (rv); +} + +uchar_t +fnvlist_lookup_byte(nvlist_t *nvl, const char *name) +{ + uchar_t rv; + VERIFY0(nvlist_lookup_byte(nvl, name, &rv)); + return (rv); +} + +int8_t +fnvlist_lookup_int8(nvlist_t *nvl, const char *name) +{ + int8_t rv; + VERIFY0(nvlist_lookup_int8(nvl, name, &rv)); + return (rv); +} + +int16_t +fnvlist_lookup_int16(nvlist_t *nvl, const char *name) +{ + int16_t rv; + VERIFY0(nvlist_lookup_int16(nvl, name, &rv)); + return (rv); +} + +int32_t +fnvlist_lookup_int32(nvlist_t *nvl, const char *name) +{ + int32_t rv; + VERIFY0(nvlist_lookup_int32(nvl, name, &rv)); + return (rv); +} + +int64_t +fnvlist_lookup_int64(nvlist_t *nvl, const char *name) +{ + int64_t rv; + VERIFY0(nvlist_lookup_int64(nvl, name, &rv)); + return (rv); +} + +uint8_t +fnvlist_lookup_uint8_t(nvlist_t *nvl, const char *name) +{ + uint8_t rv; + VERIFY0(nvlist_lookup_uint8(nvl, name, &rv)); + return (rv); +} + +uint16_t +fnvlist_lookup_uint16(nvlist_t *nvl, const char *name) +{ + uint16_t rv; + VERIFY0(nvlist_lookup_uint16(nvl, name, &rv)); + return (rv); +} + +uint32_t +fnvlist_lookup_uint32(nvlist_t *nvl, const char *name) +{ + uint32_t rv; + VERIFY0(nvlist_lookup_uint32(nvl, name, &rv)); + return (rv); +} + +uint64_t +fnvlist_lookup_uint64(nvlist_t *nvl, const char *name) +{ + uint64_t rv; + VERIFY0(nvlist_lookup_uint64(nvl, name, &rv)); + return (rv); +} + +char * +fnvlist_lookup_string(nvlist_t *nvl, const char *name) +{ + char *rv; + VERIFY0(nvlist_lookup_string(nvl, name, &rv)); + return (rv); +} + +nvlist_t * +fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) +{ + nvlist_t *rv; + VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv)); + return (rv); +} + +boolean_t +fnvpair_value_boolean_value(nvpair_t *nvp) +{ + boolean_t rv; + VERIFY0(nvpair_value_boolean_value(nvp, &rv)); + return (rv); +} + +uchar_t +fnvpair_value_byte(nvpair_t *nvp) +{ + uchar_t rv; + VERIFY0(nvpair_value_byte(nvp, &rv)); + return 
(rv); +} + +int8_t +fnvpair_value_int8(nvpair_t *nvp) +{ + int8_t rv; + VERIFY0(nvpair_value_int8(nvp, &rv)); + return (rv); +} + +int16_t +fnvpair_value_int16(nvpair_t *nvp) +{ + int16_t rv; + VERIFY0(nvpair_value_int16(nvp, &rv)); + return (rv); +} + +int32_t +fnvpair_value_int32(nvpair_t *nvp) +{ + int32_t rv; + VERIFY0(nvpair_value_int32(nvp, &rv)); + return (rv); +} + +int64_t +fnvpair_value_int64(nvpair_t *nvp) +{ + int64_t rv; + VERIFY0(nvpair_value_int64(nvp, &rv)); + return (rv); +} + +uint8_t +fnvpair_value_uint8_t(nvpair_t *nvp) +{ + uint8_t rv; + VERIFY0(nvpair_value_uint8(nvp, &rv)); + return (rv); +} + +uint16_t +fnvpair_value_uint16(nvpair_t *nvp) +{ + uint16_t rv; + VERIFY0(nvpair_value_uint16(nvp, &rv)); + return (rv); +} + +uint32_t +fnvpair_value_uint32(nvpair_t *nvp) +{ + uint32_t rv; + VERIFY0(nvpair_value_uint32(nvp, &rv)); + return (rv); +} + +uint64_t +fnvpair_value_uint64(nvpair_t *nvp) +{ + uint64_t rv; + VERIFY0(nvpair_value_uint64(nvp, &rv)); + return (rv); +} + +char * +fnvpair_value_string(nvpair_t *nvp) +{ + char *rv; + VERIFY0(nvpair_value_string(nvp, &rv)); + return (rv); +} + +nvlist_t * +fnvpair_value_nvlist(nvpair_t *nvp) +{ + nvlist_t *rv; + VERIFY0(nvpair_value_nvlist(nvp, &rv)); + return (rv); +} Index: src/external/cddl/osnet/dist/common/nvpair/nvpair.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/nvpair/nvpair.c,v retrieving revision 1.3 diff -u -p -r1.3 nvpair.c --- src/external/cddl/osnet/dist/common/nvpair/nvpair.c 10 Apr 2015 22:29:35 -0000 1.3 +++ src/external/cddl/osnet/dist/common/nvpair/nvpair.c 10 Jun 2017 05:30:43 -0000 @@ -35,7 +35,6 @@ #if defined(_KERNEL) && !defined(_BOOT) #include -#include #include #else #include @@ -49,6 +48,14 @@ #endif #define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++ +#if !defined(illumos) && !defined(_KERNEL) +/* + * libnvpair is the lowest commen denominator for ZFS related libraries, + * defining aok here makes it usable by all ZFS related libraries + */ +int aok; +#endif + /* * nvpair.c - Provides kernel & userland interfaces for manipulating * name-value pairs. Index: src/external/cddl/osnet/dist/common/unicode/u8_textprep.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/unicode/u8_textprep.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 u8_textprep.c --- src/external/cddl/osnet/dist/common/unicode/u8_textprep.c 7 Aug 2009 18:32:31 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/common/unicode/u8_textprep.c 18 Apr 2017 15:05:00 -0000 @@ -23,7 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" /* @@ -42,14 +41,13 @@ #include #include #include -#include #include #else -#include #include #endif /* _KERNEL */ #include #include +#include #include @@ -144,10 +142,10 @@ #define U8_16BIT_TABLE_INDICATOR (0x8000U) /* The following are some convenience macros. 
*/ -#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ - (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \ - (uint32_t)(b3) & 0x3F; - +#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ + (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ + (((uint32_t)(b2) & 0x3F) << 6) | \ + ((uint32_t)(b3) & 0x3F)); #define U8_SIMPLE_SWAP(a, b, t) \ (t) = (a); \ (a) = (b); \ @@ -217,10 +215,10 @@ const int8_t u8_number_of_bytes[0x100] = /* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, -/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ +/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, -/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ +/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, /* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ Index: src/external/cddl/osnet/dist/common/util/strtolctype.h =================================================================== RCS file: src/external/cddl/osnet/dist/common/util/strtolctype.h diff -N src/external/cddl/osnet/dist/common/util/strtolctype.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/common/util/strtolctype.h 4 Feb 2015 07:24:07 -0000 @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#ifndef _COMMON_UTIL_CTYPE_H +#define _COMMON_UTIL_CTYPE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This header file contains a collection of macros that the strtou?ll? + * functions in common/util use to test characters. What we need is a kernel + * version of ctype.h. + * + * NOTE: These macros are used within several DTrace probe context functions. + * They must not be altered to make function calls or perform actions not + * safe in probe context. + */ + +#if defined(illumos) && (defined(_KERNEL) || defined(_BOOT)) + +#define isalnum(ch) (isalpha(ch) || isdigit(ch)) +#define isalpha(ch) (isupper(ch) || islower(ch)) +#define isdigit(ch) ((ch) >= '0' && (ch) <= '9') +#define islower(ch) ((ch) >= 'a' && (ch) <= 'z') +#define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \ + ((ch) == '\t') || ((ch) == '\f')) +#define isupper(ch) ((ch) >= 'A' && (ch) <= 'Z') +#define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ + ((ch) >= 'A' && (ch) <= 'F')) + +#endif /* _KERNEL || _BOOT */ + +#define DIGIT(x) \ + (isdigit(x) ? (x) - '0' : islower(x) ? 
(x) + 10 - 'a' : (x) + 10 - 'A') + +#define MBASE ('z' - 'a' + 1 + 10) + +/* + * The following macro is a version of isalnum() that limits alphabetic + * characters to the ranges a-z and A-Z; locale dependent characters will not + * return 1. The members of a-z and A-Z are assumed to be in ascending order + * and contiguous. + */ +#define lisalnum(x) \ + (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z')) + +#ifdef __cplusplus +} +#endif + +#endif /* _COMMON_UTIL_CTYPE_H */ Index: src/external/cddl/osnet/dist/common/zfs/zfeature_common.c =================================================================== RCS file: src/external/cddl/osnet/dist/common/zfs/zfeature_common.c diff -N src/external/cddl/osnet/dist/common/zfs/zfeature_common.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/common/zfs/zfeature_common.c 27 Jun 2017 23:57:12 -0000 @@ -0,0 +1,252 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#ifdef _KERNEL +#include +#else +#include +#include +#endif +#include +#include +#include +#include "zfeature_common.h" + +/* + * Set to disable all feature checks while opening pools, allowing pools with + * unsupported features to be opened. Set for testing only. + */ +boolean_t zfeature_checks_disable = B_FALSE; + +zfeature_info_t spa_feature_table[SPA_FEATURES]; + +/* + * Valid characters for feature guids. This list is mainly for aesthetic + * purposes and could be expanded in the future. There are different allowed + * characters in the guids reverse dns portion (before the colon) and its + * short name (after the colon). + */ +static int +valid_char(char c, boolean_t after_colon) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + (after_colon && c == '_') || + (!after_colon && (c == '.' || c == '-'))); +} + +/* + * Every feature guid must contain exactly one colon which separates a reverse + * dns organization name from the feature's "short" name (e.g. + * "com.company:feature_name"). 
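A short sketch, not part of the patch, of strings that zfeature_is_valid_guid() (defined just below) accepts and rejects; the first two are feature guids registered later in this file, the last two are made-up counterexamples.

/*
 * Illustration only: exactly one colon, with the reverse-DNS part
 * limited to [a-z0-9.-] and the short name to [a-z0-9_].
 */
static void
guid_examples(void)
{
	ASSERT(zfeature_is_valid_guid("com.delphix:async_destroy"));
	ASSERT(zfeature_is_valid_guid("org.open-zfs:large_blocks"));

	ASSERT(!zfeature_is_valid_guid("nocolonhere"));
	ASSERT(!zfeature_is_valid_guid("com.example:two:colons"));
}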
+ */ +boolean_t +zfeature_is_valid_guid(const char *name) +{ + int i; + boolean_t has_colon = B_FALSE; + + i = 0; + while (name[i] != '\0') { + char c = name[i++]; + if (c == ':') { + if (has_colon) + return (B_FALSE); + has_colon = B_TRUE; + continue; + } + if (!valid_char(c, has_colon)) + return (B_FALSE); + } + + return (has_colon); +} + +boolean_t +zfeature_is_supported(const char *guid) +{ + if (zfeature_checks_disable) + return (B_TRUE); + + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(guid, feature->fi_guid) == 0) + return (B_TRUE); + } + return (B_FALSE); +} + +int +zfeature_lookup_name(const char *name, spa_feature_t *res) +{ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(name, feature->fi_uname) == 0) { + if (res != NULL) + *res = i; + return (0); + } + } + + return (ENOENT); +} + +boolean_t +zfeature_depends_on(spa_feature_t fid, spa_feature_t check) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + + for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { + if (feature->fi_depends[i] == check) + return (B_TRUE); + } + return (B_FALSE); +} + +static void +zfeature_register(spa_feature_t fid, const char *guid, const char *name, + const char *desc, zfeature_flags_t flags, const spa_feature_t *deps) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; + + ASSERT(name != NULL); + ASSERT(desc != NULL); + ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || + (flags & ZFEATURE_FLAG_MOS) == 0); + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(zfeature_is_valid_guid(guid)); + + if (deps == NULL) + deps = nodeps; + + feature->fi_feature = fid; + feature->fi_guid = guid; + feature->fi_uname = name; + feature->fi_desc = desc; + feature->fi_flags = flags; + feature->fi_depends = deps; +} + +void +zpool_feature_init(void) +{ + zfeature_register(SPA_FEATURE_ASYNC_DESTROY, + "com.delphix:async_destroy", "async_destroy", + "Destroy filesystems asynchronously.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); + + zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, + "com.delphix:empty_bpobj", "empty_bpobj", + "Snapshots use less space.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); + + zfeature_register(SPA_FEATURE_LZ4_COMPRESS, + "org.illumos:lz4_compress", "lz4_compress", + "LZ4 compression algorithm support.", + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, NULL); + + zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, + "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", + "Crash dumps to multiple vdev pools.", + 0, NULL); + + zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, + "com.delphix:spacemap_histogram", "spacemap_histogram", + "Spacemaps maintain space histograms.", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); + + zfeature_register(SPA_FEATURE_ENABLED_TXG, + "com.delphix:enabled_txg", "enabled_txg", + "Record txg at which a feature is enabled", + ZFEATURE_FLAG_READONLY_COMPAT, NULL); + + static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_NONE }; + zfeature_register(SPA_FEATURE_HOLE_BIRTH, + "com.delphix:hole_birth", "hole_birth", + "Retain hole birth txg for more precise zfs send", + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + hole_birth_deps); + + zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, + "com.delphix:extensible_dataset", "extensible_dataset", + "Enhanced dataset functionality, used by other features.", + 0, NULL); + + static const spa_feature_t 
bookmarks_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_BOOKMARKS, + "com.delphix:bookmarks", "bookmarks", + "\"zfs bookmark\" command", + ZFEATURE_FLAG_READONLY_COMPAT, bookmarks_deps); + + static const spa_feature_t filesystem_limits_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_FS_SS_LIMIT, + "com.joyent:filesystem_limits", "filesystem_limits", + "Filesystem and snapshot limits.", + ZFEATURE_FLAG_READONLY_COMPAT, filesystem_limits_deps); + + zfeature_register(SPA_FEATURE_EMBEDDED_DATA, + "com.delphix:embedded_data", "embedded_data", + "Blocks which compress very well use even less space.", + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + NULL); + + static const spa_feature_t large_blocks_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_BLOCKS, + "org.open-zfs:large_blocks", "large_blocks", + "Support for blocks larger than 128KB.", + ZFEATURE_FLAG_PER_DATASET, large_blocks_deps); +#ifndef __NetBSD__ + zfeature_register(SPA_FEATURE_SHA512, + "org.illumos:sha512", "sha512", + "SHA-512/256 hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, NULL); + zfeature_register(SPA_FEATURE_SKEIN, + "org.illumos:skein", "skein", + "Skein hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, NULL); +#endif + +#ifdef illumos + zfeature_register(SPA_FEATURE_EDONR, + "org.illumos:edonr", "edonr", + "Edon-R hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, NULL); +#endif +} Index: src/external/cddl/osnet/dist/common/zfs/zfeature_common.h =================================================================== RCS file: src/external/cddl/osnet/dist/common/zfs/zfeature_common.h diff -N src/external/cddl/osnet/dist/common/zfs/zfeature_common.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/common/zfs/zfeature_common.h 27 Jun 2017 23:57:31 -0000 @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
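/*
 * Illustrative sketch (not part of the diff above): a minimal example of
 * how the feature-flag API added here might be used.  zfeature_lookup_name()
 * resolves a user-facing feature name to its spa_feature_t index, and
 * zfeature_depends_on() walks the fi_depends array registered in
 * zpool_feature_init().  With the registrations above, passing "bookmarks"
 * should return B_TRUE, because bookmarks_deps lists
 * SPA_FEATURE_EXTENSIBLE_DATASET.  The helper name is hypothetical.
 */
#include "zfeature_common.h"

static boolean_t
feature_needs_extensible_dataset(const char *uname)
{
	spa_feature_t fid;

	/* Unknown names return ENOENT; treat them as "no dependency". */
	if (zfeature_lookup_name(uname, &fid) != 0)
		return (B_FALSE);
	return (zfeature_depends_on(fid, SPA_FEATURE_EXTENSIBLE_DATASET));
}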
+ * Copyright (c) 2014 Integros [integros.com] + */ + +#ifndef _ZFEATURE_COMMON_H +#define _ZFEATURE_COMMON_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct zfeature_info; + +typedef enum spa_feature { + SPA_FEATURE_NONE = -1, + SPA_FEATURE_ASYNC_DESTROY, + SPA_FEATURE_EMPTY_BPOBJ, + SPA_FEATURE_LZ4_COMPRESS, + SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, + SPA_FEATURE_SPACEMAP_HISTOGRAM, + SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_HOLE_BIRTH, + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_EMBEDDED_DATA, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_FS_SS_LIMIT, + SPA_FEATURE_LARGE_BLOCKS, +#ifndef __NetBSD__ + SPA_FEATURE_SHA512, + SPA_FEATURE_SKEIN, +#endif +#ifdef illumos + SPA_FEATURE_EDONR, +#endif + SPA_FEATURES +} spa_feature_t; + +#define SPA_FEATURE_DISABLED (-1ULL) + +typedef enum zfeature_flags { + /* Can open pool readonly even if this feature is not supported. */ + ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), + /* Is this feature necessary to read the MOS? */ + ZFEATURE_FLAG_MOS = (1 << 1), + /* Activate this feature at the same time it is enabled. */ + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), + /* Each dataset has a field set if it has ever used this feature. */ + ZFEATURE_FLAG_PER_DATASET = (1 << 3) +} zfeature_flags_t; + +typedef struct zfeature_info { + spa_feature_t fi_feature; + const char *fi_uname; /* User-facing feature name */ + const char *fi_guid; /* On-disk feature identifier */ + const char *fi_desc; /* Feature description */ + zfeature_flags_t fi_flags; + /* array of dependencies, terminated by SPA_FEATURE_NONE */ + const spa_feature_t *fi_depends; +} zfeature_info_t; + +typedef int (zfeature_func_t)(zfeature_info_t *, void *); + +#define ZFS_FEATURE_DEBUG + +extern zfeature_info_t spa_feature_table[SPA_FEATURES]; + +extern boolean_t zfeature_is_valid_guid(const char *); + +extern boolean_t zfeature_is_supported(const char *); +extern int zfeature_lookup_name(const char *, spa_feature_t *); +extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); + +extern void zpool_feature_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFEATURE_COMMON_H */ Index: src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_comutil.c --- src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c 27 Feb 2010 22:29:40 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_comutil.c 23 Mar 2013 15:29:25 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. */ /* @@ -37,8 +37,8 @@ #include #include -#include #include +#include "zfs_comutil.h" /* * Are there allocatable vdevs? @@ -103,3 +103,104 @@ zpool_get_rewind_policy(nvlist_t *nvl, z if (zrpp->zrp_request == 0) zrpp->zrp_request = ZPOOL_NO_REWIND; } + +typedef struct zfs_version_spa_map { + int version_zpl; + int version_spa; +} zfs_version_spa_map_t; + +/* + * Keep this table in monotonically increasing version number order. 
+ */ +static zfs_version_spa_map_t zfs_version_table[] = { + {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL}, + {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL}, + {ZPL_VERSION_FUID, SPA_VERSION_FUID}, + {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, + {ZPL_VERSION_SA, SPA_VERSION_SA}, + {0, 0} +}; + +/* + * Return the max zpl version for a corresponding spa version + * -1 is returned if no mapping exists. + */ +int +zfs_zpl_version_map(int spa_version) +{ + int i; + int version = -1; + + for (i = 0; zfs_version_table[i].version_spa; i++) { + if (spa_version >= zfs_version_table[i].version_spa) + version = zfs_version_table[i].version_zpl; + } + + return (version); +} + +/* + * Return the min spa version for a corresponding spa version + * -1 is returned if no mapping exists. + */ +int +zfs_spa_version_map(int zpl_version) +{ + int i; + int version = -1; + + for (i = 0; zfs_version_table[i].version_zpl; i++) { + if (zfs_version_table[i].version_zpl >= zpl_version) + return (zfs_version_table[i].version_spa); + } + + return (version); +} + +/* + * This is the table of legacy internal event names; it should not be modified. + * The internal events are now stored in the history log as strings. + */ +const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = { + "invalid event", + "pool create", + "vdev add", + "pool remove", + "pool destroy", + "pool export", + "pool import", + "vdev attach", + "vdev replace", + "vdev detach", + "vdev online", + "vdev offline", + "vdev upgrade", + "pool clear", + "pool scrub", + "pool property set", + "create", + "clone", + "destroy", + "destroy_begin_sync", + "inherit", + "property set", + "quota set", + "permission update", + "permission remove", + "permission who remove", + "promote", + "receive", + "rename", + "reservation set", + "replay_inc_sync", + "replay_full_sync", + "rollback", + "snapshot", + "filesystem version upgrade", + "refquota set", + "refreservation set", + "pool scrub done", + "user hold", + "user release", + "pool split", +}; Index: src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_comutil.h --- src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h 27 Feb 2010 22:29:40 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_comutil.h 23 Mar 2013 15:29:25 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
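/*
 * Illustrative sketch (not part of the diff above): one way a caller might
 * use the ZPL/SPA version mapping, e.g. to refuse a ZPL version that the
 * pool at hand cannot support.  With the table above,
 * zfs_spa_version_map(ZPL_VERSION_FUID) yields SPA_VERSION_FUID, and a ZPL
 * version newer than every table entry maps to -1.  The helper name and
 * policy are hypothetical.
 */
#include "zfs_comutil.h"

static int
zpl_version_is_supported(int zpl_version, int pool_spa_version)
{
	int need = zfs_spa_version_map(zpl_version);

	if (need == -1)
		return (0);	/* no mapping: treat as unsupported */
	return (pool_spa_version >= need);
}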
*/ #ifndef _ZFS_COMUTIL_H @@ -36,6 +36,11 @@ extern "C" { extern boolean_t zfs_allocatable_devs(nvlist_t *); extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *); +extern int zfs_zpl_version_map(int spa_version); +extern int zfs_spa_version_map(int zpl_version); +#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41 +extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; + #ifdef __cplusplus } #endif Index: src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_deleg.c --- src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c 27 Feb 2010 22:29:40 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_deleg.c 10 Oct 2016 11:10:02 -0000 @@ -19,10 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov */ +#include + #if defined(_KERNEL) #include #include @@ -34,42 +38,34 @@ #include #include #endif -/* XXX includes zfs_context.h, so why bother with the above? */ #include #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_namecheck.h" -/* - * permission table - * - * Keep this table in sorted order - * - * This table is used for displaying all permissions for - * zfs allow - */ - zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { - {ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW}, - {ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, - {ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, - {ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, - {ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, - {ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, - {ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, - {ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, - {ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, - {ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, - {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, - {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, - {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, - {ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, - {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, - {ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, - {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, - {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, - {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, - {NULL, ZFS_DELEG_NOTE_NONE } + {ZFS_DELEG_PERM_ALLOW}, + {ZFS_DELEG_PERM_BOOKMARK}, + {ZFS_DELEG_PERM_CLONE}, + {ZFS_DELEG_PERM_CREATE}, + {ZFS_DELEG_PERM_DESTROY}, + {ZFS_DELEG_PERM_DIFF}, + {ZFS_DELEG_PERM_MOUNT}, + {ZFS_DELEG_PERM_PROMOTE}, + {ZFS_DELEG_PERM_RECEIVE}, + {ZFS_DELEG_PERM_RENAME}, + {ZFS_DELEG_PERM_ROLLBACK}, + {ZFS_DELEG_PERM_SNAPSHOT}, + {ZFS_DELEG_PERM_SHARE}, + {ZFS_DELEG_PERM_SEND}, + {ZFS_DELEG_PERM_USERPROP}, + {ZFS_DELEG_PERM_USERQUOTA}, + {ZFS_DELEG_PERM_GROUPQUOTA}, + {ZFS_DELEG_PERM_USERUSED}, + {ZFS_DELEG_PERM_GROUPUSED}, + {ZFS_DELEG_PERM_HOLD}, + {ZFS_DELEG_PERM_RELEASE}, + {NULL} }; static int @@ -182,8 +178,9 @@ zfs_deleg_verify_nvlist(nvlist_t *nvp) nvpair_name(perm_name)); if (error) return (-1); - } while (perm_name = nvlist_next_nvpair(perms, perm_name)); - } while (who = nvlist_next_nvpair(nvp, who)); + } while 
((perm_name = nvlist_next_nvpair(perms, perm_name)) + != NULL); + } while ((who = nvlist_next_nvpair(nvp, who)) != NULL); return (0); } Index: src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_deleg.h --- src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h 27 Feb 2010 22:29:40 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_deleg.h 13 Jan 2014 02:59:29 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _ZFS_DELEG_H @@ -52,6 +53,7 @@ typedef enum { ZFS_DELEG_NOTE_CLONE, ZFS_DELEG_NOTE_PROMOTE, ZFS_DELEG_NOTE_RENAME, + ZFS_DELEG_NOTE_SEND, ZFS_DELEG_NOTE_RECEIVE, ZFS_DELEG_NOTE_ALLOW, ZFS_DELEG_NOTE_USERPROP, @@ -63,6 +65,8 @@ typedef enum { ZFS_DELEG_NOTE_GROUPUSED, ZFS_DELEG_NOTE_HOLD, ZFS_DELEG_NOTE_RELEASE, + ZFS_DELEG_NOTE_DIFF, + ZFS_DELEG_NOTE_BOOKMARK, ZFS_DELEG_NOTE_NONE } zfs_deleg_note_t; Index: src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 zfs_fletcher.c --- src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c 27 Feb 2010 22:29:40 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c 22 Nov 2015 17:22:21 -0000 @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. 
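/*
 * Illustrative sketch (not part of the diff above): the nvpair iteration
 * idiom that the zfs_deleg_verify_nvlist() change adopts.  Writing the
 * assignment with parentheses and an explicit "!= NULL" keeps the loop
 * behaviour identical while avoiding the assignment-used-as-truth-value
 * compiler warning.
 */
#include <stdio.h>
#include <libnvpair.h>	/* assumed userland header for nvlist/nvpair */

static void
print_nvpair_names(nvlist_t *nvl)
{
	nvpair_t *elem = NULL;

	while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL)
		(void) printf("%s\n", nvpair_name(elem));
}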
+ */ /* * Fletcher Checksums @@ -131,8 +134,10 @@ #include #include +/*ARGSUSED*/ void -fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); @@ -148,8 +153,10 @@ fletcher_2_native(const void *buf, uint6 ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); } +/*ARGSUSED*/ void -fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); @@ -165,8 +172,10 @@ fletcher_2_byteswap(const void *buf, uin ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); } +/*ARGSUSED*/ void -fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_4_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); @@ -182,8 +191,10 @@ fletcher_4_native(const void *buf, uint6 ZIO_SET_CHECKSUM(zcp, a, b, c, d); } +/*ARGSUSED*/ void -fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_4_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); Index: src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 zfs_fletcher.h --- src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h 27 Feb 2010 22:29:40 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h 22 Nov 2015 17:22:21 -0000 @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. 
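/*
 * Illustrative sketch (not part of the diff above): after this change the
 * fletcher entry points take an extra ctx_template argument, which these
 * implementations ignore (hence the ARGSUSED annotations).  Existing
 * callers can simply pass NULL; the computed checksum is unchanged.  The
 * helper name below is hypothetical.
 */
#include "zfs_fletcher.h"	/* updated prototypes */

static void
checksum_buf_fletcher4(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	/* NULL is fine here: ctx_template is unused by fletcher. */
	fletcher_4_native(buf, size, NULL, zcp);
}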
+ */ #ifndef _ZFS_FLETCHER_H #define _ZFS_FLETCHER_H @@ -37,14 +40,12 @@ extern "C" { * fletcher checksum functions */ -void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); -void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_incremental_native(const void *, uint64_t, - zio_cksum_t *); -void fletcher_4_incremental_byteswap(const void *, uint64_t, - zio_cksum_t *); +void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); +void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); +void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); +void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); +void fletcher_4_incremental_native(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_incremental_byteswap(const void *, uint64_t, zio_cksum_t *); #ifdef __cplusplus } Index: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c =================================================================== RCS file: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c diff -N src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.c 30 Apr 2017 03:32:02 -0000 @@ -0,0 +1,1380 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Xin Li . All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Portions Copyright 2005, 2010, Oracle and/or its affiliates. + * All rights reserved. + * Use is subject to license terms. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_namecheck.h" +#include "zfs_ioctl_compat.h" + +static int zfs_version_ioctl = ZFS_IOCVER_CURRENT; +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, + 0, "ZFS_IOCTL_VERSION"); + +/* + * FreeBSD zfs_cmd compatibility with older binaries + * appropriately remap/extend the zfs_cmd_t structure + */ +void +zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) +{ + zfs_cmd_v15_t *zc_c; + zfs_cmd_v28_t *zc28_c; + zfs_cmd_deadman_t *zcdm_c; + zfs_cmd_zcmd_t *zcmd_c; + zfs_cmd_edbp_t *edbp_c; + zfs_cmd_resume_t *resume_c; + zfs_cmd_inlanes_t *inlanes_c; + + switch (cflag) { + case ZFS_CMD_COMPAT_INLANES: + inlanes_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, inlanes_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, inlanes_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, inlanes_c->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zc->field = inlanes_c->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + FIELD_COPY(zc_begin_record); + FIELD_COPY(zc_inject_record); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_resumable); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + + case ZFS_CMD_COMPAT_RESUME: + resume_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, resume_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, resume_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, resume_c->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zc->field = resume_c->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + FIELD_COPY(zc_begin_record); + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + resume_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + 
zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_resumable); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + + case ZFS_CMD_COMPAT_EDBP: + edbp_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, edbp_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, edbp_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, edbp_c->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zc->field = edbp_c->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + zc->zc_begin_record.drr_u.drr_begin = edbp_c->zc_begin_record; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + edbp_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + zc->zc_resumable = B_FALSE; + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + + case ZFS_CMD_COMPAT_ZCMD: + zcmd_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, zcmd_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, zcmd_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, zcmd_c->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zc->field = zcmd_c->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + zc->zc_begin_record.drr_u.drr_begin = zcmd_c->zc_begin_record; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + 
FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + zcmd_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + + /* boolean_t -> uint32_t */ + zc->zc_defer_destroy = (uint32_t)(zcmd_c->zc_defer_destroy); + zc->zc_flags = 0; + + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + zc->zc_resumable = B_FALSE; + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + + break; + + case ZFS_CMD_COMPAT_DEADMAN: + zcdm_c = (void *)addr; + /* zc */ + strlcpy(zc->zc_name, zcdm_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, zcdm_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, zcdm_c->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zc->field = zcdm_c->field + zc->zc_guid = zcdm_c->zc_guid; + zc->zc_nvlist_conf = zcdm_c->zc_nvlist_conf; + zc->zc_nvlist_conf_size = zcdm_c->zc_nvlist_conf_size; + zc->zc_nvlist_src = zcdm_c->zc_nvlist_src; + zc->zc_nvlist_src_size = zcdm_c->zc_nvlist_src_size; + zc->zc_nvlist_dst = zcdm_c->zc_nvlist_dst; + zc->zc_nvlist_dst_size = zcdm_c->zc_nvlist_dst_size; + zc->zc_cookie = zcdm_c->zc_cookie; + zc->zc_objset_type = zcdm_c->zc_objset_type; + zc->zc_perm_action = zcdm_c->zc_perm_action; + zc->zc_history = zcdm_c->zc_history; + zc->zc_history_len = zcdm_c->zc_history_len; + zc->zc_history_offset = zcdm_c->zc_history_offset; + zc->zc_obj = zcdm_c->zc_obj; + zc->zc_iflags = zcdm_c->zc_iflags; + zc->zc_share = zcdm_c->zc_share; + zc->zc_jailid = zcdm_c->zc_jailid; + zc->zc_objset_stats = zcdm_c->zc_objset_stats; + zc->zc_begin_record.drr_u.drr_begin = zcdm_c->zc_begin_record; + zc->zc_defer_destroy = zcdm_c->zc_defer_destroy; + (void)zcdm_c->zc_temphold; + zc->zc_action_handle = zcdm_c->zc_action_handle; + zc->zc_cleanup_fd = zcdm_c->zc_cleanup_fd; + zc->zc_simple = zcdm_c->zc_simple; + zc->zc_resumable = B_FALSE; + zc->zc_sendobj = zcdm_c->zc_sendobj; + zc->zc_fromobj = zcdm_c->zc_fromobj; + zc->zc_createtxg = zcdm_c->zc_createtxg; + zc->zc_stat = zcdm_c->zc_stat; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(zc->zc_inject_record.zi_func, + resume_c->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + zc->zc_inject_record.zi_nlanes = 1; + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + + /* we always assume zc_nvlist_dst_filled is true */ + zc->zc_nvlist_dst_filled = B_TRUE; +#undef FIELD_COPY + break; + + case ZFS_CMD_COMPAT_V28: + zc28_c = (void *)addr; + + /* zc */ + strlcpy(zc->zc_name, zc28_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, zc28_c->zc_value, MAXPATHLEN * 2); + strlcpy(zc->zc_string, zc28_c->zc_string, MAXPATHLEN); 
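/*
 * Illustrative sketch (not part of the diff above): the FIELD_COPY pattern
 * used throughout zfs_cmd_compat_get()/zfs_cmd_compat_put().  Each compat
 * branch defines a short-lived macro that copies identically named members
 * between two zfs_cmd layouts and #undefs it before the next branch rebinds
 * it to its own source pointer.  The struct names below are hypothetical
 * stand-ins, not the real zfs_cmd versions.
 */
#include <stdint.h>

struct cmd_old { uint64_t guid; uint64_t cookie; };
struct cmd_new { uint64_t guid; uint64_t cookie; uint64_t resumable; };

static void
cmd_old_to_new(struct cmd_new *dst, const struct cmd_old *src)
{
#define	FIELD_COPY(field)	dst->field = src->field
	FIELD_COPY(guid);
	FIELD_COPY(cookie);
#undef	FIELD_COPY
	dst->resumable = 0;	/* field absent in the old layout */
}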
+ zc->zc_guid = zc28_c->zc_guid; + zc->zc_nvlist_conf = zc28_c->zc_nvlist_conf; + zc->zc_nvlist_conf_size = zc28_c->zc_nvlist_conf_size; + zc->zc_nvlist_src = zc28_c->zc_nvlist_src; + zc->zc_nvlist_src_size = zc28_c->zc_nvlist_src_size; + zc->zc_nvlist_dst = zc28_c->zc_nvlist_dst; + zc->zc_nvlist_dst_size = zc28_c->zc_nvlist_dst_size; + zc->zc_cookie = zc28_c->zc_cookie; + zc->zc_objset_type = zc28_c->zc_objset_type; + zc->zc_perm_action = zc28_c->zc_perm_action; + zc->zc_history = zc28_c->zc_history; + zc->zc_history_len = zc28_c->zc_history_len; + zc->zc_history_offset = zc28_c->zc_history_offset; + zc->zc_obj = zc28_c->zc_obj; + zc->zc_iflags = zc28_c->zc_iflags; + zc->zc_share = zc28_c->zc_share; + zc->zc_jailid = zc28_c->zc_jailid; + zc->zc_objset_stats = zc28_c->zc_objset_stats; + zc->zc_begin_record.drr_u.drr_begin = zc28_c->zc_begin_record; + zc->zc_defer_destroy = zc28_c->zc_defer_destroy; + (void)zc28_c->zc_temphold; + zc->zc_action_handle = zc28_c->zc_action_handle; + zc->zc_cleanup_fd = zc28_c->zc_cleanup_fd; + zc->zc_simple = zc28_c->zc_simple; + zc->zc_resumable = B_FALSE; + zc->zc_sendobj = zc28_c->zc_sendobj; + zc->zc_fromobj = zc28_c->zc_fromobj; + zc->zc_createtxg = zc28_c->zc_createtxg; + zc->zc_stat = zc28_c->zc_stat; + + /* zc->zc_inject_record */ + zc->zc_inject_record.zi_objset = + zc28_c->zc_inject_record.zi_objset; + zc->zc_inject_record.zi_object = + zc28_c->zc_inject_record.zi_object; + zc->zc_inject_record.zi_start = + zc28_c->zc_inject_record.zi_start; + zc->zc_inject_record.zi_end = + zc28_c->zc_inject_record.zi_end; + zc->zc_inject_record.zi_guid = + zc28_c->zc_inject_record.zi_guid; + zc->zc_inject_record.zi_level = + zc28_c->zc_inject_record.zi_level; + zc->zc_inject_record.zi_error = + zc28_c->zc_inject_record.zi_error; + zc->zc_inject_record.zi_type = + zc28_c->zc_inject_record.zi_type; + zc->zc_inject_record.zi_freq = + zc28_c->zc_inject_record.zi_freq; + zc->zc_inject_record.zi_failfast = + zc28_c->zc_inject_record.zi_failfast; + strlcpy(zc->zc_inject_record.zi_func, + zc28_c->zc_inject_record.zi_func, MAXNAMELEN); + zc->zc_inject_record.zi_iotype = + zc28_c->zc_inject_record.zi_iotype; + zc->zc_inject_record.zi_duration = + zc28_c->zc_inject_record.zi_duration; + zc->zc_inject_record.zi_timer = + zc28_c->zc_inject_record.zi_timer; + zc->zc_inject_record.zi_nlanes = 1; + zc->zc_inject_record.zi_cmd = ZINJECT_UNINITIALIZED; + zc->zc_inject_record.zi_pad = 0; + break; + + case ZFS_CMD_COMPAT_V15: + zc_c = (void *)addr; + + /* zc */ + strlcpy(zc->zc_name, zc_c->zc_name, MAXPATHLEN); + strlcpy(zc->zc_value, zc_c->zc_value, MAXPATHLEN); + strlcpy(zc->zc_string, zc_c->zc_string, MAXPATHLEN); + zc->zc_guid = zc_c->zc_guid; + zc->zc_nvlist_conf = zc_c->zc_nvlist_conf; + zc->zc_nvlist_conf_size = zc_c->zc_nvlist_conf_size; + zc->zc_nvlist_src = zc_c->zc_nvlist_src; + zc->zc_nvlist_src_size = zc_c->zc_nvlist_src_size; + zc->zc_nvlist_dst = zc_c->zc_nvlist_dst; + zc->zc_nvlist_dst_size = zc_c->zc_nvlist_dst_size; + zc->zc_cookie = zc_c->zc_cookie; + zc->zc_objset_type = zc_c->zc_objset_type; + zc->zc_perm_action = zc_c->zc_perm_action; + zc->zc_history = zc_c->zc_history; + zc->zc_history_len = zc_c->zc_history_len; + zc->zc_history_offset = zc_c->zc_history_offset; + zc->zc_obj = zc_c->zc_obj; + zc->zc_share = zc_c->zc_share; + zc->zc_jailid = zc_c->zc_jailid; + zc->zc_objset_stats = zc_c->zc_objset_stats; + zc->zc_begin_record.drr_u.drr_begin = zc_c->zc_begin_record; + + /* zc->zc_inject_record */ + zc->zc_inject_record.zi_objset = + 
zc_c->zc_inject_record.zi_objset; + zc->zc_inject_record.zi_object = + zc_c->zc_inject_record.zi_object; + zc->zc_inject_record.zi_start = + zc_c->zc_inject_record.zi_start; + zc->zc_inject_record.zi_end = + zc_c->zc_inject_record.zi_end; + zc->zc_inject_record.zi_guid = + zc_c->zc_inject_record.zi_guid; + zc->zc_inject_record.zi_level = + zc_c->zc_inject_record.zi_level; + zc->zc_inject_record.zi_error = + zc_c->zc_inject_record.zi_error; + zc->zc_inject_record.zi_type = + zc_c->zc_inject_record.zi_type; + zc->zc_inject_record.zi_freq = + zc_c->zc_inject_record.zi_freq; + zc->zc_inject_record.zi_failfast = + zc_c->zc_inject_record.zi_failfast; + break; + } +} + +void +zfs_cmd_compat_put(zfs_cmd_t *zc, caddr_t addr, const int request, + const int cflag) +{ + zfs_cmd_v15_t *zc_c; + zfs_cmd_v28_t *zc28_c; + zfs_cmd_deadman_t *zcdm_c; + zfs_cmd_zcmd_t *zcmd_c; + zfs_cmd_edbp_t *edbp_c; + zfs_cmd_resume_t *resume_c; + zfs_cmd_inlanes_t *inlanes_c; + + switch (cflag) { + case ZFS_CMD_COMPAT_INLANES: + inlanes_c = (void *)addr; + strlcpy(inlanes_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(inlanes_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(inlanes_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) inlanes_c->field = zc->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + FIELD_COPY(zc_begin_record); + FIELD_COPY(zc_inject_record); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + + case ZFS_CMD_COMPAT_RESUME: + resume_c = (void *)addr; + strlcpy(resume_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(resume_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(resume_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) resume_c->field = zc->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + FIELD_COPY(zc_begin_record); + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + 
zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + + case ZFS_CMD_COMPAT_EDBP: + edbp_c = (void *)addr; + strlcpy(edbp_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(edbp_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(edbp_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) edbp_c->field = zc->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + edbp_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + FIELD_COPY(zc_defer_destroy); + FIELD_COPY(zc_flags); + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + break; + + case ZFS_CMD_COMPAT_ZCMD: + zcmd_c = (void *)addr; + /* zc */ + strlcpy(zcmd_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(zcmd_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(zcmd_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zcmd_c->field = zc->field + FIELD_COPY(zc_nvlist_src); + FIELD_COPY(zc_nvlist_src_size); + FIELD_COPY(zc_nvlist_dst); + FIELD_COPY(zc_nvlist_dst_size); + FIELD_COPY(zc_nvlist_dst_filled); + FIELD_COPY(zc_pad2); + FIELD_COPY(zc_history); + FIELD_COPY(zc_guid); + FIELD_COPY(zc_nvlist_conf); + FIELD_COPY(zc_nvlist_conf_size); + FIELD_COPY(zc_cookie); + FIELD_COPY(zc_objset_type); + FIELD_COPY(zc_perm_action); + FIELD_COPY(zc_history_len); + FIELD_COPY(zc_history_offset); + FIELD_COPY(zc_obj); + FIELD_COPY(zc_iflags); + FIELD_COPY(zc_share); + FIELD_COPY(zc_jailid); + FIELD_COPY(zc_objset_stats); + zcmd_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + 
FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); + + /* boolean_t -> uint32_t */ + zcmd_c->zc_defer_destroy = (uint32_t)(zc->zc_defer_destroy); + zcmd_c->zc_temphold = 0; + + FIELD_COPY(zc_action_handle); + FIELD_COPY(zc_cleanup_fd); + FIELD_COPY(zc_simple); + FIELD_COPY(zc_sendobj); + FIELD_COPY(zc_fromobj); + FIELD_COPY(zc_createtxg); + FIELD_COPY(zc_stat); +#undef FIELD_COPY + + break; + + case ZFS_CMD_COMPAT_DEADMAN: + zcdm_c = (void *)addr; + + strlcpy(zcdm_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(zcdm_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(zcdm_c->zc_string, zc->zc_string, MAXPATHLEN); + +#define FIELD_COPY(field) zcdm_c->field = zc->field + zcdm_c->zc_guid = zc->zc_guid; + zcdm_c->zc_nvlist_conf = zc->zc_nvlist_conf; + zcdm_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; + zcdm_c->zc_nvlist_src = zc->zc_nvlist_src; + zcdm_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; + zcdm_c->zc_nvlist_dst = zc->zc_nvlist_dst; + zcdm_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; + zcdm_c->zc_cookie = zc->zc_cookie; + zcdm_c->zc_objset_type = zc->zc_objset_type; + zcdm_c->zc_perm_action = zc->zc_perm_action; + zcdm_c->zc_history = zc->zc_history; + zcdm_c->zc_history_len = zc->zc_history_len; + zcdm_c->zc_history_offset = zc->zc_history_offset; + zcdm_c->zc_obj = zc->zc_obj; + zcdm_c->zc_iflags = zc->zc_iflags; + zcdm_c->zc_share = zc->zc_share; + zcdm_c->zc_jailid = zc->zc_jailid; + zcdm_c->zc_objset_stats = zc->zc_objset_stats; + zcdm_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; + zcdm_c->zc_defer_destroy = zc->zc_defer_destroy; + zcdm_c->zc_temphold = 0; + zcdm_c->zc_action_handle = zc->zc_action_handle; + zcdm_c->zc_cleanup_fd = zc->zc_cleanup_fd; + zcdm_c->zc_simple = zc->zc_simple; + zcdm_c->zc_sendobj = zc->zc_sendobj; + zcdm_c->zc_fromobj = zc->zc_fromobj; + zcdm_c->zc_createtxg = zc->zc_createtxg; + zcdm_c->zc_stat = zc->zc_stat; + FIELD_COPY(zc_inject_record.zi_objset); + FIELD_COPY(zc_inject_record.zi_object); + FIELD_COPY(zc_inject_record.zi_start); + FIELD_COPY(zc_inject_record.zi_end); + FIELD_COPY(zc_inject_record.zi_guid); + FIELD_COPY(zc_inject_record.zi_level); + FIELD_COPY(zc_inject_record.zi_error); + FIELD_COPY(zc_inject_record.zi_type); + FIELD_COPY(zc_inject_record.zi_freq); + FIELD_COPY(zc_inject_record.zi_failfast); + strlcpy(resume_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + FIELD_COPY(zc_inject_record.zi_iotype); + FIELD_COPY(zc_inject_record.zi_duration); + FIELD_COPY(zc_inject_record.zi_timer); + FIELD_COPY(zc_inject_record.zi_cmd); + FIELD_COPY(zc_inject_record.zi_pad); +#undef FIELD_COPY +#ifndef _KERNEL + if (request == ZFS_IOC_RECV) + strlcpy(zcdm_c->zc_top_ds, + zc->zc_value + strlen(zc->zc_value) + 1, + (MAXPATHLEN * 2) - strlen(zc->zc_value) - 1); +#endif + break; + + case ZFS_CMD_COMPAT_V28: + zc28_c = (void *)addr; + + strlcpy(zc28_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(zc28_c->zc_value, zc->zc_value, MAXPATHLEN * 2); + strlcpy(zc28_c->zc_string, zc->zc_string, MAXPATHLEN); + zc28_c->zc_guid = zc->zc_guid; + zc28_c->zc_nvlist_conf = 
zc->zc_nvlist_conf; + zc28_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; + zc28_c->zc_nvlist_src = zc->zc_nvlist_src; + zc28_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; + zc28_c->zc_nvlist_dst = zc->zc_nvlist_dst; + zc28_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; + zc28_c->zc_cookie = zc->zc_cookie; + zc28_c->zc_objset_type = zc->zc_objset_type; + zc28_c->zc_perm_action = zc->zc_perm_action; + zc28_c->zc_history = zc->zc_history; + zc28_c->zc_history_len = zc->zc_history_len; + zc28_c->zc_history_offset = zc->zc_history_offset; + zc28_c->zc_obj = zc->zc_obj; + zc28_c->zc_iflags = zc->zc_iflags; + zc28_c->zc_share = zc->zc_share; + zc28_c->zc_jailid = zc->zc_jailid; + zc28_c->zc_objset_stats = zc->zc_objset_stats; + zc28_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; + zc28_c->zc_defer_destroy = zc->zc_defer_destroy; + zc28_c->zc_temphold = 0; + zc28_c->zc_action_handle = zc->zc_action_handle; + zc28_c->zc_cleanup_fd = zc->zc_cleanup_fd; + zc28_c->zc_simple = zc->zc_simple; + zc28_c->zc_sendobj = zc->zc_sendobj; + zc28_c->zc_fromobj = zc->zc_fromobj; + zc28_c->zc_createtxg = zc->zc_createtxg; + zc28_c->zc_stat = zc->zc_stat; +#ifndef _KERNEL + if (request == ZFS_IOC_RECV) + strlcpy(zc28_c->zc_top_ds, + zc->zc_value + strlen(zc->zc_value) + 1, + MAXPATHLEN * 2 - strlen(zc->zc_value) - 1); +#endif + /* zc_inject_record */ + zc28_c->zc_inject_record.zi_objset = + zc->zc_inject_record.zi_objset; + zc28_c->zc_inject_record.zi_object = + zc->zc_inject_record.zi_object; + zc28_c->zc_inject_record.zi_start = + zc->zc_inject_record.zi_start; + zc28_c->zc_inject_record.zi_end = + zc->zc_inject_record.zi_end; + zc28_c->zc_inject_record.zi_guid = + zc->zc_inject_record.zi_guid; + zc28_c->zc_inject_record.zi_level = + zc->zc_inject_record.zi_level; + zc28_c->zc_inject_record.zi_error = + zc->zc_inject_record.zi_error; + zc28_c->zc_inject_record.zi_type = + zc->zc_inject_record.zi_type; + zc28_c->zc_inject_record.zi_freq = + zc->zc_inject_record.zi_freq; + zc28_c->zc_inject_record.zi_failfast = + zc->zc_inject_record.zi_failfast; + strlcpy(zc28_c->zc_inject_record.zi_func, + zc->zc_inject_record.zi_func, MAXNAMELEN); + zc28_c->zc_inject_record.zi_iotype = + zc->zc_inject_record.zi_iotype; + zc28_c->zc_inject_record.zi_duration = + zc->zc_inject_record.zi_duration; + zc28_c->zc_inject_record.zi_timer = + zc->zc_inject_record.zi_timer; + break; + + case ZFS_CMD_COMPAT_V15: + zc_c = (void *)addr; + + /* zc */ + strlcpy(zc_c->zc_name, zc->zc_name, MAXPATHLEN); + strlcpy(zc_c->zc_value, zc->zc_value, MAXPATHLEN); + strlcpy(zc_c->zc_string, zc->zc_string, MAXPATHLEN); + zc_c->zc_guid = zc->zc_guid; + zc_c->zc_nvlist_conf = zc->zc_nvlist_conf; + zc_c->zc_nvlist_conf_size = zc->zc_nvlist_conf_size; + zc_c->zc_nvlist_src = zc->zc_nvlist_src; + zc_c->zc_nvlist_src_size = zc->zc_nvlist_src_size; + zc_c->zc_nvlist_dst = zc->zc_nvlist_dst; + zc_c->zc_nvlist_dst_size = zc->zc_nvlist_dst_size; + zc_c->zc_cookie = zc->zc_cookie; + zc_c->zc_objset_type = zc->zc_objset_type; + zc_c->zc_perm_action = zc->zc_perm_action; + zc_c->zc_history = zc->zc_history; + zc_c->zc_history_len = zc->zc_history_len; + zc_c->zc_history_offset = zc->zc_history_offset; + zc_c->zc_obj = zc->zc_obj; + zc_c->zc_share = zc->zc_share; + zc_c->zc_jailid = zc->zc_jailid; + zc_c->zc_objset_stats = zc->zc_objset_stats; + zc_c->zc_begin_record = zc->zc_begin_record.drr_u.drr_begin; + + /* zc_inject_record */ + zc_c->zc_inject_record.zi_objset = + zc->zc_inject_record.zi_objset; + zc_c->zc_inject_record.zi_object = + 
zc->zc_inject_record.zi_object; + zc_c->zc_inject_record.zi_start = + zc->zc_inject_record.zi_start; + zc_c->zc_inject_record.zi_end = + zc->zc_inject_record.zi_end; + zc_c->zc_inject_record.zi_guid = + zc->zc_inject_record.zi_guid; + zc_c->zc_inject_record.zi_level = + zc->zc_inject_record.zi_level; + zc_c->zc_inject_record.zi_error = + zc->zc_inject_record.zi_error; + zc_c->zc_inject_record.zi_type = + zc->zc_inject_record.zi_type; + zc_c->zc_inject_record.zi_freq = + zc->zc_inject_record.zi_freq; + zc_c->zc_inject_record.zi_failfast = + zc->zc_inject_record.zi_failfast; + + break; + } +} + +static int +zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag, + nvlist_t **nvp) +{ + char *packed; + int error; + nvlist_t *list = NULL; + + /* + * Read in and unpack the user-supplied nvlist. + */ + if (size == 0) + return (EINVAL); + +#ifdef _KERNEL + packed = kmem_alloc(size, KM_SLEEP); + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { + kmem_free(packed, size); + return (error); + } +#else + packed = (void *)(uintptr_t)nvl; +#endif + + error = nvlist_unpack(packed, size, &list, 0); + +#ifdef _KERNEL + kmem_free(packed, size); +#endif + + if (error != 0) + return (error); + + *nvp = list; + return (0); +} + +static int +zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + int error = 0; + size_t size; + + VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); + +#ifdef _KERNEL + packed = kmem_alloc(size, KM_SLEEP); + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); + + if (ddi_copyout(packed, + (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) + error = EFAULT; + kmem_free(packed, size); +#else + packed = (void *)(uintptr_t)zc->zc_nvlist_dst; + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + 0) == 0); +#endif + + zc->zc_nvlist_dst_size = size; + return (error); +} + +static void +zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl) +{ + nvlist_t **child; + nvlist_t *nvroot = NULL; + vdev_stat_t *vs; + uint_t c, children, nelem; + + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + zfs_ioctl_compat_fix_stats_nvlist(child[c]); + } + } + + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0) + zfs_ioctl_compat_fix_stats_nvlist(nvroot); +#ifdef _KERNEL + if ((nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, +#else + if ((nvlist_lookup_uint64_array(nvl, "stats", +#endif + + (uint64_t **)&vs, &nelem) == 0)) { + nvlist_add_uint64_array(nvl, +#ifdef _KERNEL + "stats", +#else + ZPOOL_CONFIG_VDEV_STATS, +#endif + (uint64_t *)vs, nelem); +#ifdef _KERNEL + nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, +#else + nvlist_remove(nvl, "stats", +#endif + DATA_TYPE_UINT64_ARRAY); + } +} + +static int +zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc) +{ + nvlist_t *nv, *nvp = NULL; + nvpair_t *elem; + int error; + + if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) + return (error); + + if (nc == 5) { /* ZFS_IOC_POOL_STATS */ + elem = NULL; + while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { + if (nvpair_value_nvlist(elem, &nvp) == 0) + zfs_ioctl_compat_fix_stats_nvlist(nvp); + } + elem = NULL; + } else + zfs_ioctl_compat_fix_stats_nvlist(nv); + + error = zfs_ioctl_compat_put_nvlist(zc, nv); + + nvlist_free(nv); + + return (error); +} + +static int +zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) +{ + nvlist_t *nv, 
*nva = NULL; + int error; + + if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) + return (error); + +#ifdef _KERNEL + if (nvlist_lookup_nvlist(nv, "allocated", &nva) == 0) { + nvlist_add_nvlist(nv, "used", nva); + nvlist_remove(nv, "allocated", DATA_TYPE_NVLIST); + } + + if (nvlist_lookup_nvlist(nv, "free", &nva) == 0) { + nvlist_add_nvlist(nv, "available", nva); + nvlist_remove(nv, "free", DATA_TYPE_NVLIST); + } +#else + if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) { + nvlist_add_nvlist(nv, "allocated", nva); + nvlist_remove(nv, "used", DATA_TYPE_NVLIST); + } + + if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) { + nvlist_add_nvlist(nv, "free", nva); + nvlist_remove(nv, "available", DATA_TYPE_NVLIST); + } +#endif + + error = zfs_ioctl_compat_put_nvlist(zc, nv); + + nvlist_free(nv); + + return (error); +} + +#ifndef _KERNEL +int +zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) +{ + int nc, ret; + void *zc_c; + unsigned long ncmd; + zfs_iocparm_t zp; + + switch (cflag) { + case ZFS_CMD_COMPAT_NONE: + ncmd = _IOWR('Z', request, struct zfs_iocparm); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof(zfs_cmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT; + return (ioctl(fd, ncmd, &zp)); + case ZFS_CMD_COMPAT_INLANES: + ncmd = _IOWR('Z', request, struct zfs_iocparm); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof(zfs_cmd_inlanes_t); + zp.zfs_ioctl_version = ZFS_IOCVER_INLANES; + return (ioctl(fd, ncmd, &zp)); + case ZFS_CMD_COMPAT_RESUME: + ncmd = _IOWR('Z', request, struct zfs_iocparm); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof(zfs_cmd_resume_t); + zp.zfs_ioctl_version = ZFS_IOCVER_RESUME; + return (ioctl(fd, ncmd, &zp)); + case ZFS_CMD_COMPAT_EDBP: + ncmd = _IOWR('Z', request, struct zfs_iocparm); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof(zfs_cmd_edbp_t); + zp.zfs_ioctl_version = ZFS_IOCVER_EDBP; + return (ioctl(fd, ncmd, &zp)); + case ZFS_CMD_COMPAT_ZCMD: + ncmd = _IOWR('Z', request, struct zfs_iocparm); + zp.zfs_cmd = (uint64_t)zc; + zp.zfs_cmd_size = sizeof(zfs_cmd_zcmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_ZCMD; + return (ioctl(fd, ncmd, &zp)); + case ZFS_CMD_COMPAT_LZC: + ncmd = _IOWR('Z', request, struct zfs_cmd); + return (ioctl(fd, ncmd, zc)); + case ZFS_CMD_COMPAT_DEADMAN: + zc_c = malloc(sizeof(zfs_cmd_deadman_t)); + ncmd = _IOWR('Z', request, struct zfs_cmd_deadman); + break; + case ZFS_CMD_COMPAT_V28: + zc_c = malloc(sizeof(zfs_cmd_v28_t)); + ncmd = _IOWR('Z', request, struct zfs_cmd_v28); + break; + case ZFS_CMD_COMPAT_V15: + nc = zfs_ioctl_v28_to_v15[request]; + zc_c = malloc(sizeof(zfs_cmd_v15_t)); + ncmd = _IOWR('Z', nc, struct zfs_cmd_v15); + break; + default: + return (EINVAL); + } + + if (ZFS_IOCREQ(ncmd) == ZFS_IOC_COMPAT_FAIL) + return (ENOTSUP); + + zfs_cmd_compat_put(zc, (caddr_t)zc_c, request, cflag); + + ret = ioctl(fd, ncmd, zc_c); + if (cflag == ZFS_CMD_COMPAT_V15 && + nc == ZFS_IOC_POOL_IMPORT) + ret = ioctl(fd, _IOWR('Z', ZFS_IOC_POOL_CONFIGS, + struct zfs_cmd_v15), zc_c); + zfs_cmd_compat_get(zc, (caddr_t)zc_c, cflag); + free(zc_c); + + if (cflag == ZFS_CMD_COMPAT_V15) { + switch (nc) { + case ZFS_IOC_POOL_IMPORT: + case ZFS_IOC_POOL_CONFIGS: + case ZFS_IOC_POOL_STATS: + case ZFS_IOC_POOL_TRYIMPORT: + zfs_ioctl_compat_fix_stats(zc, nc); + break; + case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ + zfs_ioctl_compat_pool_get_props(zc); + break; + } + } + + return (ret); +} +#else /* _KERNEL */ +int +zfs_ioctl_compat_pre(zfs_cmd_t 
*zc, int *vec, const int cflag) +{ + int error = 0; + + /* are we creating a clone? */ + if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') + *vec = ZFS_IOC_CLONE; + + if (cflag == ZFS_CMD_COMPAT_V15) { + switch (*vec) { + + case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ + zc->zc_cookie = POOL_SCAN_SCRUB; + break; + } + } + + return (error); +} + +void +zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) +{ + if (cflag == ZFS_CMD_COMPAT_V15) { + switch (vec) { + case ZFS_IOC_POOL_CONFIGS: + case ZFS_IOC_POOL_STATS: + case ZFS_IOC_POOL_TRYIMPORT: + zfs_ioctl_compat_fix_stats(zc, vec); + break; + case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ + zfs_ioctl_compat_pool_get_props(zc); + break; + } + } +} + +nvlist_t * +zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t * innvl, const int vec, + const int cflag) +{ + nvlist_t *nvl, *tmpnvl, *hnvl; + nvpair_t *elem; + char *poolname, *snapname; + int err; + + if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) + goto out; + + switch (vec) { + case ZFS_IOC_CREATE: + nvl = fnvlist_alloc(); + fnvlist_add_int32(nvl, "type", zc->zc_objset_type); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_CLONE: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "origin", zc->zc_value); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_SNAPSHOT: + if (innvl == NULL) + goto out; + nvl = fnvlist_alloc(); + fnvlist_add_nvlist(nvl, "props", innvl); + tmpnvl = fnvlist_alloc(); + snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + fnvlist_add_boolean(tmpnvl, snapname); + kmem_free(snapname, strlen(snapname + 1)); + /* check if we are doing a recursive snapshot */ + if (zc->zc_cookie) + dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, + tmpnvl); + fnvlist_add_nvlist(nvl, "snaps", tmpnvl); + fnvlist_free(tmpnvl); + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_SPACE_SNAPS: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "firstsnap", zc->zc_value); + if (innvl != NULL) + nvlist_free(innvl); + return (nvl); + break; + case ZFS_IOC_DESTROY_SNAPS: + if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) + goto out; + nvl = fnvlist_alloc(); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "snaps", innvl); + } else { + /* + * We are probably called by even older binaries, + * allocate and populate nvlist with recursive + * snapshots + */ + if (zfs_component_namecheck(zc->zc_value, NULL, + NULL) == 0) { + tmpnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, tmpnvl) == 0) + fnvlist_add_nvlist(nvl, "snaps", + tmpnvl); + nvlist_free(tmpnvl); + } + } + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_HOLD: + nvl = fnvlist_alloc(); + tmpnvl = fnvlist_alloc(); + if (zc->zc_cleanup_fd != -1) + fnvlist_add_int32(nvl, "cleanup_fd", + (int32_t)zc->zc_cleanup_fd); + if (zc->zc_cookie) { + hnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, hnvl) == 0) { + elem = NULL; + while ((elem = nvlist_next_nvpair(hnvl, + elem)) != NULL) { + nvlist_add_string(tmpnvl, + 
nvpair_name(elem), zc->zc_string); + } + } + nvlist_free(hnvl); + } else { + snapname = kmem_asprintf("%s@%s", zc->zc_name, + zc->zc_value); + nvlist_add_string(tmpnvl, snapname, zc->zc_string); + kmem_free(snapname, strlen(snapname + 1)); + } + fnvlist_add_nvlist(nvl, "holds", tmpnvl); + nvlist_free(tmpnvl); + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_RELEASE: + nvl = fnvlist_alloc(); + tmpnvl = fnvlist_alloc(); + if (zc->zc_cookie) { + hnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, hnvl) == 0) { + elem = NULL; + while ((elem = nvlist_next_nvpair(hnvl, + elem)) != NULL) { + fnvlist_add_boolean(tmpnvl, + zc->zc_string); + fnvlist_add_nvlist(nvl, + nvpair_name(elem), tmpnvl); + } + } + nvlist_free(hnvl); + } else { + snapname = kmem_asprintf("%s@%s", zc->zc_name, + zc->zc_value); + fnvlist_add_boolean(tmpnvl, zc->zc_string); + fnvlist_add_nvlist(nvl, snapname, tmpnvl); + kmem_free(snapname, strlen(snapname + 1)); + } + nvlist_free(tmpnvl); + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + } +out: + return (innvl); +} + +nvlist_t * +zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t * outnvl, const int vec, + const int cflag) +{ + nvlist_t *tmpnvl; + + if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) + return (outnvl); + + switch (vec) { + case ZFS_IOC_SPACE_SNAPS: + (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); + (void) nvlist_lookup_uint64(outnvl, "compressed", + &zc->zc_objset_type); + (void) nvlist_lookup_uint64(outnvl, "uncompressed", + &zc->zc_perm_action); + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + case ZFS_IOC_CREATE: + case ZFS_IOC_CLONE: + case ZFS_IOC_HOLD: + case ZFS_IOC_RELEASE: + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + } + + return (outnvl); +} +#endif /* KERNEL */ Index: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h =================================================================== RCS file: src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h diff -N src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/common/zfs/zfs_ioctl_compat.h 25 Apr 2017 23:40:33 -0000 @@ -0,0 +1,544 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Xin Li . All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ZFS_IOCTL_COMPAT_H +#define _SYS_ZFS_IOCTL_COMPAT_H + +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Backwards ioctl compatibility + */ + +/* ioctl versions for vfs.zfs.version.ioctl */ +#define ZFS_IOCVER_UNDEF -1 +#define ZFS_IOCVER_NONE 0 +#define ZFS_IOCVER_DEADMAN 1 +#define ZFS_IOCVER_LZC 2 +#define ZFS_IOCVER_ZCMD 3 +#define ZFS_IOCVER_EDBP 4 +#define ZFS_IOCVER_RESUME 5 +#define ZFS_IOCVER_INLANES 6 +#define ZFS_IOCVER_PAD 7 +#define ZFS_IOCVER_CURRENT ZFS_IOCVER_PAD + +/* compatibility conversion flag */ +#define ZFS_CMD_COMPAT_NONE 0 +#define ZFS_CMD_COMPAT_V15 1 +#define ZFS_CMD_COMPAT_V28 2 +#define ZFS_CMD_COMPAT_DEADMAN 3 +#define ZFS_CMD_COMPAT_LZC 4 +#define ZFS_CMD_COMPAT_ZCMD 5 +#define ZFS_CMD_COMPAT_EDBP 6 +#define ZFS_CMD_COMPAT_RESUME 7 +#define ZFS_CMD_COMPAT_INLANES 8 + +#define ZFS_IOC_COMPAT_PASS 254 +#define ZFS_IOC_COMPAT_FAIL 255 + +#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) + +typedef struct zfs_iocparm { + uint32_t zfs_ioctl_version; + uint64_t zfs_cmd; + uint64_t zfs_cmd_size; +} zfs_iocparm_t; + +typedef struct zinject_record_v15 { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; +} zinject_record_v15_t; + +typedef struct zfs_cmd_v15 { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_v15_t zc_inject_record; +} zfs_cmd_v15_t; + +typedef struct zinject_record_v28 { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; +} zinject_record_v28_t; + +typedef struct zfs_cmd_v28 { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + 
uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_v28_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_v28_t; + +typedef struct zinject_record_deadman { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; +} zinject_record_deadman_t; + +typedef struct zfs_cmd_deadman { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + /* zc_inject_record doesn't change in libzfs_core */ + zinject_record_deadman_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_deadman_t; + +typedef struct zfs_cmd_zcmd { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
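/*
 * [Editor's note: illustrative sketch only, not part of the patch.]
 * The ZFS_CMD_COMPAT_NONE case of zcmd_ioctl_compat() above shows the
 * modern calling convention: userland no longer hands a zfs_cmd_t to
 * ioctl() directly, it hands over a small fixed-size zfs_iocparm_t that
 * records the user address, size and ABI version of the real command
 * structure.  Roughly (fd, request and zc are assumed to come from the
 * caller):
 *
 *	zfs_iocparm_t zp;
 *
 *	zp.zfs_cmd = (uint64_t)(uintptr_t)zc;       /- user address of zfs_cmd_t -/
 *	zp.zfs_cmd_size = sizeof (zfs_cmd_t);       /- size of that layout -/
 *	zp.zfs_ioctl_version = ZFS_IOCVER_CURRENT;  /- ZFS_IOCVER_PAD at this revision -/
 *	error = ioctl(fd, _IOWR('Z', request, struct zfs_iocparm), &zp);
 *
 * The kernel side can then copy in the small zp first and use
 * zfs_ioctl_version to decide how to interpret the embedded command.
 */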
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_deadman_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_zcmd_t; + +typedef struct zfs_cmd_edbp { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_deadman_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_edbp_t; + +typedef struct zfs_cmd_resume { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_deadman_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + boolean_t zc_resumable; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_resume_t; + +typedef struct zfs_cmd_inlanes { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + boolean_t zc_resumable; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_inlanes_t; + +#ifdef _KERNEL +static unsigned long zfs_ioctl_v15_to_v28[] = { + 0, /* 0 ZFS_IOC_POOL_CREATE */ + 1, /* 1 ZFS_IOC_POOL_DESTROY */ + 2, /* 2 ZFS_IOC_POOL_IMPORT */ + 3, /* 3 ZFS_IOC_POOL_EXPORT */ + 4, /* 4 ZFS_IOC_POOL_CONFIGS */ + 5, /* 5 ZFS_IOC_POOL_STATS */ + 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ + 7, /* 7 ZFS_IOC_POOL_SCRUB */ + 8, /* 8 ZFS_IOC_POOL_FREEZE */ + 9, /* 9 ZFS_IOC_POOL_UPGRADE */ + 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ + 11, /* 11 ZFS_IOC_VDEV_ADD */ + 12, /* 12 ZFS_IOC_VDEV_REMOVE */ + 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ + 14, /* 14 ZFS_IOC_VDEV_ATTACH */ + 15, /* 15 ZFS_IOC_VDEV_DETACH */ + 16, /* 16 ZFS_IOC_VDEV_SETPATH */ + 18, /* 17 ZFS_IOC_OBJSET_STATS */ + 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */ + 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */ + 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */ + 22, /* 21 ZFS_IOC_SET_PROP */ + ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */ + ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */ + 23, /* 24 ZFS_IOC_CREATE */ + 24, /* 25 ZFS_IOC_DESTROY */ + 25, /* 26 ZFS_IOC_ROLLBACK */ + 26, /* 27 ZFS_IOC_RENAME */ + 27, /* 28 ZFS_IOC_RECV */ + 28, /* 29 ZFS_IOC_SEND */ + 29, /* 30 ZFS_IOC_INJECT_FAULT */ + 30, /* 31 ZFS_IOC_CLEAR_FAULT */ + 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */ + 32, /* 33 ZFS_IOC_ERROR_LOG */ + 33, /* 34 ZFS_IOC_CLEAR */ + 34, /* 35 ZFS_IOC_PROMOTE */ + 35, /* 36 ZFS_IOC_DESTROY_SNAPS */ + 36, /* 37 ZFS_IOC_SNAPSHOT */ + 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME 
*/ + 38, /* 39 ZFS_IOC_OBJ_TO_PATH */ + 39, /* 40 ZFS_IOC_POOL_SET_PROPS */ + 40, /* 41 ZFS_IOC_POOL_GET_PROPS */ + 41, /* 42 ZFS_IOC_SET_FSACL */ + 42, /* 43 ZFS_IOC_GET_FSACL */ + ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */ + 43, /* 45 ZFS_IOC_SHARE */ + 44, /* 46 ZFS_IOC_IHNERIT_PROP */ + 58, /* 47 ZFS_IOC_JAIL */ + 59, /* 48 ZFS_IOC_UNJAIL */ + 45, /* 49 ZFS_IOC_SMB_ACL */ + 46, /* 50 ZFS_IOC_USERSPACE_ONE */ + 47, /* 51 ZFS_IOC_USERSPACE_MANY */ + 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */ + 17, /* 53 ZFS_IOC_SETFRU */ +}; + +#else /* KERNEL */ +static unsigned long zfs_ioctl_v28_to_v15[] = { + 0, /* 0 ZFS_IOC_POOL_CREATE */ + 1, /* 1 ZFS_IOC_POOL_DESTROY */ + 2, /* 2 ZFS_IOC_POOL_IMPORT */ + 3, /* 3 ZFS_IOC_POOL_EXPORT */ + 4, /* 4 ZFS_IOC_POOL_CONFIGS */ + 5, /* 5 ZFS_IOC_POOL_STATS */ + 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ + 7, /* 7 ZFS_IOC_POOL_SCAN */ + 8, /* 8 ZFS_IOC_POOL_FREEZE */ + 9, /* 9 ZFS_IOC_POOL_UPGRADE */ + 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ + 11, /* 11 ZFS_IOC_VDEV_ADD */ + 12, /* 12 ZFS_IOC_VDEV_REMOVE */ + 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ + 14, /* 14 ZFS_IOC_VDEV_ATTACH */ + 15, /* 15 ZFS_IOC_VDEV_DETACH */ + 16, /* 16 ZFS_IOC_VDEV_SETPATH */ + 53, /* 17 ZFS_IOC_VDEV_SETFRU */ + 17, /* 18 ZFS_IOC_OBJSET_STATS */ + 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */ + 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */ + 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */ + 21, /* 22 ZFS_IOC_SET_PROP */ + 24, /* 23 ZFS_IOC_CREATE */ + 25, /* 24 ZFS_IOC_DESTROY */ + 26, /* 25 ZFS_IOC_ROLLBACK */ + 27, /* 26 ZFS_IOC_RENAME */ + 28, /* 27 ZFS_IOC_RECV */ + 29, /* 28 ZFS_IOC_SEND */ + 30, /* 39 ZFS_IOC_INJECT_FAULT */ + 31, /* 30 ZFS_IOC_CLEAR_FAULT */ + 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */ + 33, /* 32 ZFS_IOC_ERROR_LOG */ + 34, /* 33 ZFS_IOC_CLEAR */ + 35, /* 34 ZFS_IOC_PROMOTE */ + 36, /* 35 ZFS_IOC_DESTROY_SNAPS */ + 37, /* 36 ZFS_IOC_SNAPSHOT */ + 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */ + 39, /* 38 ZFS_IOC_OBJ_TO_PATH */ + 40, /* 39 ZFS_IOC_POOL_SET_PROPS */ + 41, /* 40 ZFS_IOC_POOL_GET_PROPS */ + 42, /* 41 ZFS_IOC_SET_FSACL */ + 43, /* 42 ZFS_IOC_GET_FSACL */ + 45, /* 43 ZFS_IOC_SHARE */ + 46, /* 44 ZFS_IOC_IHNERIT_PROP */ + 49, /* 45 ZFS_IOC_SMB_ACL */ + 50, /* 46 ZFS_IOC_USERSPACE_ONE */ + 51, /* 47 ZFS_IOC_USERSPACE_MANY */ + 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */ + ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */ + ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */ + ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */ + ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */ + ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */ + ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */ + ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */ + ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */ + ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */ + 47, /* 58 ZFS_IOC_JAIL */ + 48, /* 59 ZFS_IOC_UNJAIL */ +}; +#endif /* ! 
_KERNEL */ + +#ifdef _KERNEL +int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); +void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); +nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +#else +int zcmd_ioctl_compat(int, int, zfs_cmd_t *, const int); +#endif /* _KERNEL */ +void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); +void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ Index: src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_namecheck.c --- src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c 27 Feb 2010 22:29:40 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c 10 Oct 2016 11:10:02 -0000 @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ /* * Common name validation routines for ZFS. These routines are shared by the @@ -62,11 +65,11 @@ valid_char(char c) * [-_.: ] */ int -snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) +zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what) { const char *loc; - if (strlen(path) >= MAXNAMELEN) { + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); @@ -113,7 +116,7 @@ permset_namecheck(const char *path, name return (-1); } - return (snapshot_namecheck(&path[1], why, what)); + return (zfs_component_namecheck(&path[1], why, what)); } /* @@ -137,14 +140,9 @@ dataset_namecheck(const char *path, name /* * Make sure the name is not too long. - * - * ZFS_MAXNAMELEN is the maximum dataset length used in the userland - * which is the same as MAXNAMELEN used in the kernel. - * If ZFS_MAXNAMELEN value is changed, make sure to cleanup all - * places using MAXNAMELEN. */ - if (strlen(path) >= MAXNAMELEN) { + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); @@ -273,7 +271,7 @@ mountpoint_namecheck(const char *path, n while (*end != '/' && *end != '\0') end++; - if (end - start >= MAXNAMELEN) { + if (end - start >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); @@ -298,13 +296,8 @@ pool_namecheck(const char *pool, nameche /* * Make sure the name is not too long. - * - * ZPOOL_MAXNAMELEN is the maximum pool length used in the userland - * which is the same as MAXNAMELEN used in the kernel. - * If ZPOOL_MAXNAMELEN value is changed, make sure to cleanup all - * places using MAXNAMELEN. 
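/*
 * [Editor's note: illustrative sketch only, not part of the patch.]
 * The zfs_ioctl_v15_to_v28 and zfs_ioctl_v28_to_v15 arrays in
 * zfs_ioctl_compat.h above are plain lookup tables indexed by ioctl
 * vector number; ZFS_IOC_COMPAT_PASS and ZFS_IOC_COMPAT_FAIL are
 * sentinels for vectors that have no direct counterpart in the other
 * ABI.  zcmd_ioctl_compat() consumes the userland table roughly like
 * this (same identifiers as in the code above, nothing new introduced):
 *
 *	unsigned long nc = zfs_ioctl_v28_to_v15[request];
 *
 *	if (nc == ZFS_IOC_COMPAT_FAIL)
 *		return (ENOTSUP);            /- a v15 kernel cannot service it -/
 *	ncmd = _IOWR('Z', nc, struct zfs_cmd_v15);
 *	ret = ioctl(fd, ncmd, zc_c);         /- zc_c holds the v15-layout copy -/
 */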
*/ - if (strlen(pool) >= MAXNAMELEN) { + if (strlen(pool) >= ZFS_MAX_DATASET_NAME_LEN) { if (why) *why = NAME_ERR_TOOLONG; return (-1); Index: src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_namecheck.h --- src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h 27 Feb 2010 22:29:40 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h 13 Jan 2014 02:59:29 -0000 @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ #ifndef _ZFS_NAMECHECK_H #define _ZFS_NAMECHECK_H @@ -48,7 +51,7 @@ typedef enum { int pool_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *); int mountpoint_namecheck(const char *, namecheck_err_t *); -int snapshot_namecheck(const char *, namecheck_err_t *, char *); +int zfs_component_namecheck(const char *, namecheck_err_t *, char *); int permset_namecheck(const char *, namecheck_err_t *, char *); #ifdef __cplusplus Index: src/external/cddl/osnet/dist/common/zfs/zfs_prop.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_prop.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_prop.c --- src/external/cddl/osnet/dist/common/zfs/zfs_prop.c 27 Feb 2010 22:29:41 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_prop.c 28 Jun 2017 00:04:06 -0000 @@ -19,10 +19,15 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] */ +/* Portions Copyright 2010 Robert Milkowski */ + #include #include #include @@ -66,6 +71,14 @@ zfs_prop_init(void) { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, { "sha256", ZIO_CHECKSUM_SHA256 }, + { "noparity", ZIO_CHECKSUM_NOPARITY }, +#ifndef __NetBSD__ + { "sha512", ZIO_CHECKSUM_SHA512 }, + { "skein", ZIO_CHECKSUM_SKEIN }, +#endif +#ifdef illumos + { "edonr", ZIO_CHECKSUM_EDONR }, +#endif { NULL } }; @@ -76,6 +89,18 @@ zfs_prop_init(void) { "sha256", ZIO_CHECKSUM_SHA256 }, { "sha256,verify", ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, +#ifndef __NetBSD__ + { "sha512", ZIO_CHECKSUM_SHA512 }, + { "sha512,verify", + ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY }, + { "skein", ZIO_CHECKSUM_SKEIN }, + { "skein,verify", + ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, +#endif +#ifdef illumos + { "edonr,verify", + ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, +#endif { NULL } }; @@ -94,6 +119,7 @@ zfs_prop_init(void) { "gzip-8", ZIO_COMPRESS_GZIP_8 }, { "gzip-9", ZIO_COMPRESS_GZIP_9 }, { "zle", ZIO_COMPRESS_ZLE }, + { "lz4", ZIO_COMPRESS_LZ4 }, { NULL } }; @@ -107,6 +133,7 @@ zfs_prop_init(void) { "discard", ZFS_ACL_DISCARD }, { "groupmask", ZFS_ACL_GROUPMASK }, { "passthrough", ZFS_ACL_PASSTHROUGH }, + { "restricted", ZFS_ACL_RESTRICTED }, { NULL } }; @@ -153,6 +180,7 @@ zfs_prop_init(void) { "2", 2 }, { "3", 3 }, { "4", 4 }, + { "5", 5 }, { "current", ZPL_VERSION }, { NULL } }; @@ -183,185 +211,260 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t sync_table[] = { + { "standard", ZFS_SYNC_STANDARD }, + { "always", ZFS_SYNC_ALWAYS }, + { "disabled", ZFS_SYNC_DISABLED }, + { NULL } + }; + + static zprop_index_t volmode_table[] = { + { "default", ZFS_VOLMODE_DEFAULT }, + { "geom", ZFS_VOLMODE_GEOM }, + { "dev", ZFS_VOLMODE_DEV }, + { "none", ZFS_VOLMODE_NONE }, + { NULL } + }; + + static zprop_index_t redundant_metadata_table[] = { + { "all", ZFS_REDUNDANT_METADATA_ALL }, + { "most", ZFS_REDUNDANT_METADATA_MOST }, + { NULL } + }; + /* inherit index properties */ - register_index(ZFS_PROP_CHECKSUM, "checksum", ZIO_CHECKSUM_DEFAULT, + zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", + ZFS_REDUNDANT_METADATA_ALL, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "all | most", "REDUND_MD", + redundant_metadata_table); + zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", - checksum_table); - register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, + "standard | always | disabled", "SYNC", + sync_table); + zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", + ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME, + "on | off | fletcher2 | fletcher4 | sha256 | sha512 | " + "skein | edonr", "CHECKSUM", checksum_table); + zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | verify | sha256[,verify]", "DEDUP", - dedup_table); - register_index(ZFS_PROP_COMPRESSION, "compression", + "on | off | verify | sha256[,verify], sha512[,verify], " + "skein[,verify], edonr,verify", "DEDUP", dedup_table); + zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS", - compress_table); - register_index(ZFS_PROP_SNAPDIR, "snapdir", 
ZFS_SNAPDIR_HIDDEN, + "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4", + "COMPRESS", compress_table); + zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); - register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_GROUPMASK, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "discard | groupmask | passthrough", "ACLMODE", acl_mode_table); - register_index(ZFS_PROP_ACLINHERIT, "aclinherit", ZFS_ACL_RESTRICTED, + zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "discard | groupmask | passthrough | restricted", "ACLMODE", + acl_mode_table); + zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", + ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "discard | noallow | restricted | passthrough | passthrough-x", "ACLINHERIT", acl_inherit_table); - register_index(ZFS_PROP_COPIES, "copies", 1, - PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "1 | 2 | 3", "COPIES", copies_table); - register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", + zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "PRIMARYCACHE", cache_table); - register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", + zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table); - register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, + zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table); + zprop_register_index(ZFS_PROP_VOLMODE, "volmode", + ZFS_VOLMODE_DEFAULT, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "default | geom | dev | none", "VOLMODE", volmode_table); /* inherit index (boolean) properties */ - register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, + zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); - register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, + zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", boolean_table); - register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, + zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", boolean_table); - register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, + zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", boolean_table); - register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, + zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", boolean_table); - register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); - register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, + zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); + zprop_register_index(ZFS_PROP_XATTR, "xattr", 1, PROP_INHERIT, 
ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "XATTR", boolean_table); - register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, + zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); - register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, + zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", boolean_table); /* default index properties */ - register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, + zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | 4 | current", "VERSION", version_table); - register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, + "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); + zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); /* readonly index (boolean) properties */ - register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, + zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); - register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, + zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", boolean_table); /* set once index properties */ - register_index(ZFS_PROP_NORMALIZE, "normalization", 0, + zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "none | formC | formD | formKC | formKD", "NORMALIZATION", normalize_table); - register_index(ZFS_PROP_CASE, "casesensitivity", ZFS_CASE_SENSITIVE, - PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + zprop_register_index(ZFS_PROP_CASE, "casesensitivity", + ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_SNAPSHOT, "sensitive | insensitive | mixed", "CASE", case_table); /* set once index (boolean) properties */ - register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, + zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "UTF8ONLY", boolean_table); /* string properties */ - register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, + zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); - register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, " | legacy | none", "MOUNTPOINT"); - register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", "SHARENFS"); - register_string(ZFS_PROP_SHAREISCSI, "shareiscsi", "off", PROP_INHERIT, - ZFS_TYPE_DATASET, "on | off | type=", "SHAREISCSI"); - register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, - ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE"); - register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB"); - register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, - PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); + zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); + zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", + 
"MOUNTPOINT"); + zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", + "SHARENFS"); + zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, + "filesystem | volume | snapshot | bookmark", "TYPE"); + zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "on | off | sharemgr(1M) options", "SHARESMB"); + zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", + ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, + "", "MLSLABEL"); + zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, + "receive_resume_token", + NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "RESUMETOK"); /* readonly number properties */ - register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, + zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "USED"); - register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, + zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); - register_number(ZFS_PROP_REFERENCED, "referenced", 0, PROP_READONLY, - ZFS_TYPE_DATASET, "", "REFER"); - register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, + zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, + PROP_READONLY, ZFS_TYPE_DATASET, "", "REFER"); + zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); - register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", + zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, + PROP_READONLY, ZFS_TYPE_DATASET, + "<1.00x or higher if compressed>", "REFRATIO"); + zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); - register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); - register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDDS"); - register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDCHILD"); - register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, + zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", + "USEDSNAP"); + zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", + "USEDDS"); + zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", + "USEDCHILD"); + zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); - register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, + zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "", "USERREFS"); + zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, + ZFS_TYPE_DATASET, "", "WRITTEN"); + zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, + PROP_READONLY, ZFS_TYPE_DATASET, "", "LUSED"); + zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", + 0, PROP_READONLY, ZFS_TYPE_DATASET, "", "LREFER"); /* default number properties */ - 
register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, + zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); - register_number(ZFS_PROP_RESERVATION, "reservation", 0, PROP_DEFAULT, - ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "RESERV"); - register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, + zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, + PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + " | none", "RESERV"); + zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, ZFS_TYPE_VOLUME, "", "VOLSIZE"); - register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, + zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); - register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, + zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, " | none", "REFRESERV"); + zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, + " | none", "FSLIMIT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + " | none", "SSLIMIT"); + zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, + "", "FSCOUNT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "SSCOUNT"); /* inherit number properties */ - register_number(ZFS_PROP_RECORDSIZE, "recordsize", SPA_MAXBLOCKSIZE, - PROP_INHERIT, - ZFS_TYPE_FILESYSTEM, "512 to 128k, power of 2", "RECSIZE"); + zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", + SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); /* hidden properties */ - register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG"); - register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, + zprop_register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "CREATETXG"); + zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); - register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, - PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); - register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, - PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); - register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", + zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME"); + zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", + PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); + zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "STMF_SBD_LU"); - register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY, - ZFS_TYPE_DATASET, "GUID"); - register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", + zprop_register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "GUID"); + zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "USERACCOUNTING"); - 
register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, + zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); - register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, + zprop_register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); + zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); + zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); /* oddball properties */ - register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, - PROP_READONLY, ZFS_TYPE_DATASET, + zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, + NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATION", B_FALSE, B_TRUE, NULL); } @@ -443,6 +546,18 @@ zfs_prop_userquota(const char *name) } /* + * Returns true if this is a valid written@ property. + * Note that after the @, any character is valid (eg, another @, for + * written@pool/fs@origin). + */ +boolean_t +zfs_prop_written(const char *name) +{ + static const char *prefix = "written@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). */ Index: src/external/cddl/osnet/dist/common/zfs/zfs_prop.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zfs_prop.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_prop.h --- src/external/cddl/osnet/dist/common/zfs/zfs_prop.h 27 Feb 2010 22:29:40 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zfs_prop.h 12 Jun 2012 05:57:26 -0000 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -98,16 +98,16 @@ zprop_desc_t *zpool_prop_get_table(void) /* * Common routines to initialize property tables */ -void register_impl(int, const char *, zprop_type_t, uint64_t, +void zprop_register_impl(int, const char *, zprop_type_t, uint64_t, const char *, zprop_attr_t, int, const char *, const char *, boolean_t, boolean_t, const zprop_index_t *); -void register_string(int, const char *, const char *, zprop_attr_t attr, - int, const char *, const char *); -void register_number(int, const char *, uint64_t, zprop_attr_t, int, +void zprop_register_string(int, const char *, const char *, + zprop_attr_t attr, int, const char *, const char *); +void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int, const char *, const char *); -void register_index(int, const char *, uint64_t, zprop_attr_t, int, +void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int, const char *, const char *, const zprop_index_t *); -void register_hidden(int, const char *, zprop_type_t, zprop_attr_t, +void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t, int, const char *); /* @@ -121,6 +121,8 @@ uint64_t zprop_random_value(int, uint64_ const char *zprop_values(int, zfs_type_t); size_t zprop_width(int, boolean_t *, zfs_type_t); boolean_t zprop_valid_for_type(int, zfs_type_t); +boolean_t zfs_prop_written(const char *name); + #ifdef __cplusplus } Index: src/external/cddl/osnet/dist/common/zfs/zpool_prop.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zpool_prop.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zpool_prop.c --- src/external/cddl/osnet/dist/common/zfs/zpool_prop.c 27 Feb 2010 22:29:41 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zpool_prop.c 27 Mar 2016 02:52:13 -0000 @@ -19,8 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] */ #include @@ -64,53 +66,70 @@ zpool_prop_init(void) }; /* string properties */ - register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, + zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "ALTROOT"); - register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, + zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, ZFS_TYPE_POOL, "", "BOOTFS"); - register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, PROP_DEFAULT, - ZFS_TYPE_POOL, " | none", "CACHEFILE"); + zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, " | none", "CACHEFILE"); + zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, "", "COMMENT"); /* readonly number properties */ - register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, + zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "SIZE"); - register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, + zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "FREE"); - register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "ALLOC"); - register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, + zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "FREEING"); + zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "LEAKED"); + zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, + PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); + zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, + PROP_READONLY, ZFS_TYPE_POOL, "", "EXPANDSZ"); + zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, + PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG"); + zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CAP"); - register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, + zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID"); - register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, + zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "HEALTH"); - register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY, - ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP"); + zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", + "DEDUP"); /* default number properties */ - register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, + zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); - register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, + zprop_register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); /* default index (boolean) properties */ - register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, - ZFS_TYPE_POOL, "on | off", "DELEGATION", boolean_table); - register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, PROP_DEFAULT, - ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); - register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, - ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); - register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT, - ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); + zprop_register_index(ZPOOL_PROP_DELEGATION, 
"delegation", 1, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", + boolean_table); + zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); + zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", + boolean_table); + zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); + zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table); /* default index properties */ - register_index(ZPOOL_PROP_FAILUREMODE, "failmode", + zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, "wait | continue | panic", "FAILMODE", failuremode_table); /* hidden properties */ - register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, + zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_POOL, "NAME"); + zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); } /* @@ -156,6 +175,26 @@ zpool_prop_default_numeric(zpool_prop_t return (zpool_prop_table[prop].pd_numdefault); } +/* + * Returns true if this is a valid feature@ property. + */ +boolean_t +zpool_prop_feature(const char *name) +{ + static const char *prefix = "feature@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* + * Returns true if this is a valid unsupported@ property. + */ +boolean_t +zpool_prop_unsupported(const char *name) +{ + static const char *prefix = "unsupported@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + int zpool_prop_string_to_index(zpool_prop_t prop, const char *string, uint64_t *index) Index: src/external/cddl/osnet/dist/common/zfs/zprop_common.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/common/zfs/zprop_common.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zprop_common.c --- src/external/cddl/osnet/dist/common/zfs/zprop_common.c 27 Feb 2010 22:29:41 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/common/zfs/zprop_common.c 16 Jun 2017 17:20:43 -0000 @@ -19,9 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ /* * Common routines used by zfs and zpool property management. 
@@ -39,7 +42,11 @@ #if defined(_KERNEL) #include +#ifdef __FreeBSD__ +#include +#else #include +#endif #else #include #include @@ -65,7 +72,7 @@ zprop_get_numprops(zfs_type_t type) } void -register_impl(int prop, const char *name, zprop_type_t type, +zprop_register_impl(int prop, const char *name, zprop_type_t type, uint64_t numdefault, const char *strdefault, zprop_attr_t attr, int objset_types, const char *values, const char *colname, boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl) @@ -97,38 +104,40 @@ register_impl(int prop, const char *name } void -register_string(int prop, const char *name, const char *def, +zprop_register_string(int prop, const char *name, const char *def, zprop_attr_t attr, int objset_types, const char *values, const char *colname) { - register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, + zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, objset_types, values, colname, B_FALSE, B_TRUE, NULL); } void -register_number(int prop, const char *name, uint64_t def, zprop_attr_t attr, - int objset_types, const char *values, const char *colname) +zprop_register_number(int prop, const char *name, uint64_t def, + zprop_attr_t attr, int objset_types, const char *values, + const char *colname) { - register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr, + zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr, objset_types, values, colname, B_TRUE, B_TRUE, NULL); } void -register_index(int prop, const char *name, uint64_t def, zprop_attr_t attr, - int objset_types, const char *values, const char *colname, - const zprop_index_t *idx_tbl) +zprop_register_index(int prop, const char *name, uint64_t def, + zprop_attr_t attr, int objset_types, const char *values, + const char *colname, const zprop_index_t *idx_tbl) { - register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, + zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, objset_types, values, colname, B_TRUE, B_TRUE, idx_tbl); } void -register_hidden(int prop, const char *name, zprop_type_t type, +zprop_register_hidden(int prop, const char *name, zprop_type_t type, zprop_attr_t attr, int objset_types, const char *colname) { - register_impl(prop, name, type, 0, NULL, attr, - objset_types, NULL, colname, B_FALSE, B_FALSE, NULL); + zprop_register_impl(prop, name, type, 0, NULL, attr, + objset_types, NULL, colname, + type == PROP_TYPE_NUMBER, B_FALSE, NULL); } @@ -161,7 +170,7 @@ int zprop_iter_common(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered, zfs_type_t type) { - int i, num_props, size, prop; + int i, j, num_props, size, prop; zprop_desc_t *prop_tbl; zprop_desc_t **order; @@ -176,7 +185,7 @@ zprop_iter_common(zprop_func func, void return (ZPROP_CONT); #endif - for (int j = 0; j < num_props; j++) + for (j = 0; j < num_props; j++) order[j] = &prop_tbl[j]; if (ordered) { Index: src/external/cddl/osnet/dist/head/nlist.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/head/nlist.h,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 nlist.h --- src/external/cddl/osnet/dist/head/nlist.h 20 Feb 2010 04:33:53 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/head/nlist.h 12 Apr 2017 22:47:20 -0000 @@ -19,6 +19,9 @@ * * CDDL HEADER END */ +/* + * Copyright 2014 Garrett D'Amore + */ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ @@ -26,8 +29,6 @@ #ifndef _NLIST_H #define _NLIST_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.8.2.4 */ - #ifdef __cplusplus extern 
"C" { #endif @@ -41,11 +42,7 @@ struct nlist { char n_numaux; /* number of aux. entries */ }; -#if defined(__STDC__) extern int nlist(const char *, struct nlist *); -#else /* __STDC__ */ -extern int nlist(); -#endif /* __STDC__ */ #ifdef __cplusplus } Index: src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c,v retrieving revision 1.6 diff -u -p -r1.6 ctf_lib.c --- src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c 24 Sep 2015 14:20:18 -0000 1.6 +++ src/external/cddl/osnet/dist/lib/libctf/common/ctf_lib.c 12 Apr 2017 22:50:09 -0000 @@ -350,6 +350,7 @@ ctf_fdopen(int fd, int *errp) if ((sp32 = malloc(nbytes)) == NULL || pread64(fd, sp32, nbytes, hdr.e64.e_shoff) != nbytes) { free(sp); + free(sp32); return (ctf_set_open_errno(errp, errno)); } Index: src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c,v retrieving revision 1.3 diff -u -p -r1.3 dt_isadep.c --- src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c 21 Feb 2015 15:13:20 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libdtrace/arm/dt_isadep.c 8 May 2017 12:44:43 -0000 @@ -22,6 +22,9 @@ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Howard Su + * Copyright 2015 George V. Neville-Neil + * */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -36,6 +39,10 @@ #include #include +#ifndef illumos +#include +#endif + #define OP(x) ((x) >> 30) #define OP2(x) (((x) >> 22) & 0x07) #define COND(x) (((x) >> 25) & 0x0f) @@ -75,6 +82,8 @@ dt_pid_create_return_probe(struct ps_pro { uint32_t *text; + int i; + int srdepth = 0; dt_dprintf("%s: unimplemented\n", __func__); return (DT_PROC_ERR); @@ -83,13 +92,12 @@ dt_pid_create_return_probe(struct ps_pro dt_dprintf("mr sparkle: malloc() failed\n"); return (DT_PROC_ERR); } -#ifdef DOODAD + if (Pread(P, text, symp->st_size, symp->st_value) != symp->st_size) { dt_dprintf("mr sparkle: Pread() failed\n"); free(text); return (DT_PROC_ERR); } -#endif /* * Leave a dummy instruction in the last slot to simplify edge @@ -179,4 +187,3 @@ dt_pid_create_glob_offset_probes(struct return (ftp->ftps_noffs); } - Index: src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c,v retrieving revision 1.7 diff -u -p -r1.7 drti.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c 1 Mar 2016 21:09:17 -0000 1.7 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/drti.c 12 Apr 2017 22:52:42 -0000 @@ -32,9 +32,7 @@ #include #include -#define dprintf __hide_dprintf #include -#undef dprintf #include #include #include @@ -68,7 +66,7 @@ extern dof_hdr_t __SUNW_dof; /* DOF defi static boolean_t dof_init_debug = B_FALSE; /* From DTRACE_DOF_INIT_DEBUG */ static void __printflike(2,3) -dprintf(int debug, const char *fmt, ...) +dbg_printf(int debug, const char *fmt, ...) 
{ va_list ap; @@ -122,13 +120,13 @@ dtrace_dof_init(void) dof_init_debug = B_TRUE; if (dlinfo(RTLD_SELF, RTLD_DI_LINKMAP, &lmp) == -1 || lmp == NULL) { - dprintf(1, "couldn't discover module name or address\n"); + dbg_printf(1, "couldn't discover module name or address\n"); return; } #ifdef illumos if (dlinfo(RTLD_SELF, RTLD_DI_LMID, &lmid) == -1) { - dprintf(1, "couldn't discover link map ID\n"); + dbg_printf(1, "couldn't discover link map ID\n"); return; } #endif @@ -142,7 +140,7 @@ dtrace_dof_init(void) dof->dofh_ident[DOF_ID_MAG1] != DOF_MAG_MAG1 || dof->dofh_ident[DOF_ID_MAG2] != DOF_MAG_MAG2 || dof->dofh_ident[DOF_ID_MAG3] != DOF_MAG_MAG3) { - dprintf(0, ".SUNW_dof section corrupt\n"); + dbg_printf(0, ".SUNW_dof section corrupt\n"); return; } @@ -166,7 +164,7 @@ dtrace_dof_init(void) devnamep = p; if ((fd = open64(devnamep, O_RDWR)) < 0) { - dprintf(1, "failed to open helper device %s", devnamep); + dbg_printf(1, "failed to open helper device %s", devnamep); #ifdef illumos /* * If the device path wasn't explicitly set, try again with @@ -178,7 +176,7 @@ dtrace_dof_init(void) devnamep = olddevname; if ((fd = open64(devnamep, O_RDWR)) < 0) { - dprintf(1, "failed to open helper device %s", devnamep); + dbg_printf(1, "failed to open helper device %s", devnamep); return; } #else @@ -186,9 +184,9 @@ dtrace_dof_init(void) #endif } if ((gen = ioctl(fd, DTRACEHIOC_ADDDOF, &dh)) == -1) - dprintf(1, "DTrace ioctl failed for DOF at %p", dof); + dbg_printf(1, "DTrace ioctl failed for DOF at %p", dof); else { - dprintf(1, "DTrace ioctl succeeded for DOF at %p\n", dof); + dbg_printf(1, "DTrace ioctl succeeded for DOF at %p\n", dof); #if defined(__FreeBSD__) || defined(__NetBSD__) gen = dh.dofhp_gen; #endif @@ -209,14 +207,14 @@ dtrace_dof_fini(void) int fd; if ((fd = open64(devnamep, O_RDWR)) < 0) { - dprintf(1, "failed to open helper device %s", devnamep); + dbg_printf(1, "failed to open helper device %s", devnamep); return; } if ((gen = ioctl(fd, DTRACEHIOC_REMOVE, &gen)) == -1) - dprintf(1, "DTrace ioctl failed to remove DOF (%d)\n", gen); + dbg_printf(1, "DTrace ioctl failed to remove DOF (%d)\n", gen); else - dprintf(1, "DTrace ioctl removed DOF (%d)\n", gen); + dbg_printf(1, "DTrace ioctl removed DOF (%d)\n", gen); (void) close(fd); } Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c,v retrieving revision 1.6 diff -u -p -r1.6 dt_aggregate.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c 24 Sep 2015 14:25:29 -0000 1.6 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_aggregate.c 20 Apr 2017 14:50:58 -0000 @@ -39,10 +39,8 @@ #include #else #include -#if defined(__FreeBSD__) || defined(__NetBSD__) #include #endif -#endif #include #define DTRACE_AHASHSIZE 32779 /* big 'ol prime */ @@ -188,7 +186,7 @@ dt_aggregate_lquantizedcmp(int64_t *lhs, { long double lsum = dt_aggregate_lquantizedsum(lhs); long double rsum = dt_aggregate_lquantizedsum(rhs); - int64_t lzero, rzero; + int64_t lzero = 0, rzero = 0; if (lsum < rsum) return (DT_LESSTHAN); Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c,v retrieving revision 1.4 diff -u -p -r1.4 dt_as.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c 24 Sep 2015 
14:25:29 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_as.c 12 Apr 2017 22:55:13 -0000 @@ -430,12 +430,9 @@ dt_as(dt_pcb_t *pcb) if ((idp = dip->di_extern) == NULL) continue; /* no relocation entry needed */ -/*###431 [cc] error: 'kbits' may be used uninitialized in this function [-Werror=maybe-uninitialized]%%%*/ if ((idp->di_flags & kmask) == kbits) { nodef = knodef; rp = krp++; -/*###434 [cc] error: 'ubits' may be used uninitialized in this function [-Werror=maybe-uninitialized]%%%*/ -/*###434 [cc] error: 'umask' may be used uninitialized in this function [-Werror=maybe-uninitialized]%%%*/ } else if ((idp->di_flags & umask) == ubits) { nodef = unodef; rp = urp++; Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c,v retrieving revision 1.4 diff -u -p -r1.4 dt_cc.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c 24 Sep 2015 14:25:29 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cc.c 12 Apr 2017 23:04:59 -0000 @@ -21,8 +21,9 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2015 Gary Mills */ /* @@ -119,7 +120,6 @@ static const dtrace_diftype_t dt_int_rty static void *dt_compile(dtrace_hdl_t *, int, dtrace_probespec_t, void *, uint_t, int, char *const[], FILE *, const char *); - /*ARGSUSED*/ static int dt_idreset(dt_idhash_t *dhp, dt_ident_t *idp, void *ignored) @@ -1058,46 +1058,6 @@ dt_action_printm(dtrace_hdl_t *dtp, dt_n } static void -dt_action_printt(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) -{ - dtrace_actdesc_t *ap = dt_stmt_action(dtp, sdp); - - dt_node_t *size = dnp->dn_args; - dt_node_t *addr = dnp->dn_args->dn_list; - - char n[DT_TYPE_NAMELEN]; - - if (dt_node_is_posconst(size) == 0) { - dnerror(size, D_PRINTT_SIZE, "printt( ) argument #1 must " - "be a non-zero positive integral constant expression\n"); - } - - if (addr == NULL || addr->dn_kind != DT_NODE_FUNC || - addr->dn_ident != dt_idhash_lookup(dtp->dt_globals, "typeref")) { - dnerror(addr, D_PRINTT_ADDR, - "printt( ) argument #2 is incompatible with " - "prototype:\n\tprototype: typeref()\n" - "\t argument: %s\n", - dt_node_type_name(addr, n, sizeof (n))); - } - - dt_cg(yypcb, addr); - ap->dtad_difo = dt_as(yypcb); - ap->dtad_kind = DTRACEACT_PRINTT; - - ap->dtad_difo->dtdo_rtype.dtdt_flags |= DIF_TF_BYREF; - - /* - * Allow additional buffer space for the data size, type size, - * type string length and a stab in the dark (32 bytes) for the - * type string. The type string is part of the typeref() that - * this action references. 
- */ - ap->dtad_difo->dtdo_rtype.dtdt_size = size->dn_value + 3 * sizeof(uintptr_t) + 32; - -} - -static void dt_action_commit(dtrace_hdl_t *dtp, dt_node_t *dnp, dtrace_stmtdesc_t *sdp) { dtrace_actdesc_t *ap = dt_stmt_action(dtp, sdp); @@ -1169,9 +1129,6 @@ dt_compile_fun(dtrace_hdl_t *dtp, dt_nod case DT_ACT_PRINTM: dt_action_printm(dtp, dnp->dn_expr, sdp); break; - case DT_ACT_PRINTT: - dt_action_printt(dtp, dnp->dn_expr, sdp); - break; case DT_ACT_RAISE: dt_action_raise(dtp, dnp->dn_expr, sdp); break; @@ -2435,7 +2392,7 @@ dt_compile(dtrace_hdl_t *dtp, int contex dt_node_t *dnp; dt_decl_t *ddp; dt_pcb_t pcb; - void *rv = NULL; // XXX: gcc + void *volatile rv; int err; if ((fp == NULL && s == NULL) || (cflags & ~DTRACE_C_MASK) != 0) { @@ -2518,6 +2475,28 @@ dt_compile(dtrace_hdl_t *dtp, int contex } /* + * Perform sugar transformations (for "if" / "else") and replace the + * existing clause chain with the new one. + */ + if (context == DT_CTX_DPROG) { + dt_node_t *dnp, *next_dnp; + dt_node_t *new_list = NULL; + + for (dnp = yypcb->pcb_root->dn_list; + dnp != NULL; dnp = next_dnp) { + /* remove this node from the list */ + next_dnp = dnp->dn_list; + dnp->dn_list = NULL; + + if (dnp->dn_kind == DT_NODE_CLAUSE) + dnp = dt_compile_sugar(dtp, dnp); + /* append node to the new list */ + new_list = dt_node_link(new_list, dnp); + } + yypcb->pcb_root->dn_list = new_list; + } + + /* * If we have successfully created a parse tree for a D program, loop * over the clauses and actions and instantiate the corresponding * libdtrace program. If we are parsing a D expression, then we @@ -2537,6 +2516,8 @@ dt_compile(dtrace_hdl_t *dtp, int contex for (; dnp != NULL; dnp = dnp->dn_list) { switch (dnp->dn_kind) { case DT_NODE_CLAUSE: + if (DT_TREEDUMP_PASS(dtp, 4)) + dt_printd(dnp, stderr, 0); dt_compile_clause(dtp, dnp); break; case DT_NODE_XLATOR: Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c,v retrieving revision 1.5 diff -u -p -r1.5 dt_cg.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c 24 Sep 2015 14:25:29 -0000 1.5 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_cg.c 12 Apr 2017 23:13:06 -0000 @@ -446,7 +446,6 @@ dt_cg_store(dt_node_t *src, dt_irlist_t instr = DIF_INSTR_STORE(DIF_OP_STX, reg, dst->dn_reg); break; default: - instr = 0; xyerror(D_UNKNOWN, "internal error -- cg cannot store " "size %lu when passed by value\n", (ulong_t)size); } @@ -1354,40 +1353,6 @@ dt_cg_inline(dt_node_t *dnp, dt_irlist_t } } -static void -dt_cg_func_typeref(dtrace_hdl_t *dtp, dt_node_t *dnp) -{ - dtrace_typeinfo_t dtt; - dt_node_t *addr = dnp->dn_args; - dt_node_t *nelm = addr->dn_list; - dt_node_t *strp = nelm->dn_list; - dt_node_t *typs = strp->dn_list; - char buf[DT_TYPE_NAMELEN]; - char *p; - - ctf_type_name(addr->dn_ctfp, addr->dn_type, buf, sizeof (buf)); - - /* - * XXX Hack alert! XXX - * The prototype has two dummy args that we munge to represent - * the type string and the type size. - * - * Yes, I hear your grumble, but it works for now. We'll come - * up with a more elegant implementation later. 
:-) - */ - free(strp->dn_string); - - if ((p = strchr(buf, '*')) != NULL) - *p = '\0'; - - strp->dn_string = strdup(buf); - - if (dtrace_lookup_by_type(dtp, DTRACE_OBJ_EVERY, buf, &dtt) < 0) - return; - - typs->dn_value = ctf_type_size(dtt.dtt_ctfp, dtt.dtt_type); -} - typedef struct dt_xlmemb { dt_ident_t *dtxl_idp; /* translated ident */ dt_irlist_t *dtxl_dlp; /* instruction list */ @@ -2003,8 +1968,6 @@ dt_cg_node(dt_node_t *dnp, dt_irlist_t * switch (dnp->dn_kind) { case DT_NODE_FUNC: { - dtrace_hdl_t *dtp = yypcb->pcb_hdl; - if ((idp = dnp->dn_ident)->di_kind != DT_IDENT_FUNC) { dnerror(dnp, D_CG_EXPR, "%s %s( ) may not be " "called from a D expression (D program " @@ -2012,15 +1975,6 @@ dt_cg_node(dt_node_t *dnp, dt_irlist_t * dt_idkind_name(idp->di_kind), idp->di_name); } - switch (idp->di_id) { - case DIF_SUBR_TYPEREF: - dt_cg_func_typeref(dtp, dnp); - break; - - default: - break; - } - dt_cg_arglist(dnp->dn_ident, dnp->dn_args, dlp, drp); dnp->dn_reg = dt_regset_alloc(drp); Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c,v retrieving revision 1.9 diff -u -p -r1.9 dt_consume.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c 29 Sep 2015 14:31:22 -0000 1.9 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c 29 Apr 2017 03:36:39 -0000 @@ -1297,7 +1297,7 @@ dt_print_stack(dtrace_hdl_t *dtp, FILE * if (pc > sym.st_value) { (void) snprintf(c, sizeof (c), "%s`%s+0x%llx", dts.dts_object, dts.dts_name, - (unsigned long long)(pc - sym.st_value)); + (u_longlong_t)(pc - sym.st_value)); } else { (void) snprintf(c, sizeof (c), "%s`%s", dts.dts_object, dts.dts_name); @@ -1310,10 +1310,10 @@ dt_print_stack(dtrace_hdl_t *dtp, FILE * */ if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) { (void) snprintf(c, sizeof (c), "%s`0x%llx", - dts.dts_object, (unsigned long long)pc); + dts.dts_object, (u_longlong_t)pc); } else { (void) snprintf(c, sizeof (c), "0x%llx", - (unsigned long long)pc); + (u_longlong_t)pc); } } @@ -1387,7 +1387,7 @@ dt_print_ustack(dtrace_hdl_t *dtp, FILE if (pc[i] > sym.st_value) { (void) snprintf(c, sizeof (c), "%s`%s+0x%llx", dt_basename(objname), name, - (unsigned long long)(pc[i] - sym.st_value)); + (u_longlong_t)(pc[i] - sym.st_value)); } else { (void) snprintf(c, sizeof (c), "%s`%s", dt_basename(objname), name); @@ -1413,10 +1413,10 @@ dt_print_ustack(dtrace_hdl_t *dtp, FILE if (P != NULL && Pobjname(P, pc[i], objname, sizeof (objname)) != 0) { (void) snprintf(c, sizeof (c), "%s`0x%llx", - dt_basename(objname), (unsigned long long)pc[i]); + dt_basename(objname), (u_longlong_t)pc[i]); } else { (void) snprintf(c, sizeof (c), "0x%llx", - (unsigned long long)pc[i]); + (u_longlong_t)pc[i]); } } @@ -1526,7 +1526,7 @@ dt_print_umod(dtrace_hdl_t *dtp, FILE *f if (P != NULL && Pobjname(P, pc, objname, sizeof (objname)) != 0) { (void) snprintf(c, sizeof (c), "%s", dt_basename(objname)); } else { - (void) snprintf(c, sizeof (c), "0x%llx", (unsigned long long)pc); + (void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc); } err = dt_printf(dtp, fp, format, c); @@ -1540,312 +1540,6 @@ dt_print_umod(dtrace_hdl_t *dtp, FILE *f } static int -dt_print_memory(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr) -{ - int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET); - size_t nbytes = *((uintptr_t *) addr); - - return (dt_print_bytes(dtp, fp, addr + sizeof(uintptr_t), - 
nbytes, 50, quiet, 1)); -} - -typedef struct dt_type_cbdata { - dtrace_hdl_t *dtp; - dtrace_typeinfo_t dtt; - caddr_t addr; - caddr_t addrend; - const char *name; - int f_type; - int indent; - int type_width; - int name_width; - FILE *fp; -} dt_type_cbdata_t; - -static int dt_print_type_data(dt_type_cbdata_t *, ctf_id_t); - -static int -dt_print_type_member(const char *name, ctf_id_t type, ulong_t off, void *arg) -{ - dt_type_cbdata_t cbdata; - dt_type_cbdata_t *cbdatap = arg; - ssize_t ssz; - - if ((ssz = ctf_type_size(cbdatap->dtt.dtt_ctfp, type)) <= 0) - return (0); - - off /= 8; - - cbdata = *cbdatap; - cbdata.name = name; - cbdata.addr += off; - cbdata.addrend = cbdata.addr + ssz; - - return (dt_print_type_data(&cbdata, type)); -} - -static int -dt_print_type_width(const char *name, ctf_id_t type, ulong_t off, void *arg) -{ - char buf[DT_TYPE_NAMELEN]; - char *p; - dt_type_cbdata_t *cbdatap = arg; - size_t sz = strlen(name); - - ctf_type_name(cbdatap->dtt.dtt_ctfp, type, buf, sizeof (buf)); - - if ((p = strchr(buf, '[')) != NULL) - p[-1] = '\0'; - else - p = __UNCONST(""); - - sz += strlen(p); - - if (sz > cbdatap->name_width) - cbdatap->name_width = sz; - - sz = strlen(buf); - - if (sz > cbdatap->type_width) - cbdatap->type_width = sz; - - return (0); -} - -static int -dt_print_type_data(dt_type_cbdata_t *cbdatap, ctf_id_t type) -{ - caddr_t addr = cbdatap->addr; - caddr_t addrend = cbdatap->addrend; - char buf[DT_TYPE_NAMELEN]; - char *p; - int cnt = 0; - uint_t kind = ctf_type_kind(cbdatap->dtt.dtt_ctfp, type); - ssize_t ssz = ctf_type_size(cbdatap->dtt.dtt_ctfp, type); - - ctf_type_name(cbdatap->dtt.dtt_ctfp, type, buf, sizeof (buf)); - - if ((p = strchr(buf, '[')) != NULL) - p[-1] = '\0'; - else - p = __UNCONST(""); - - if (cbdatap->f_type) { - int type_width = roundup(cbdatap->type_width + 1, 4); - int name_width = roundup(cbdatap->name_width + 1, 4); - - name_width -= strlen(cbdatap->name); - - dt_printf(cbdatap->dtp, cbdatap->fp, "%*s%-*s%s%-*s = ",cbdatap->indent * 4,"",type_width,buf,cbdatap->name,name_width,p); - } - - while (addr < addrend) { - dt_type_cbdata_t cbdata; - ctf_arinfo_t arinfo; - ctf_encoding_t cte; - void *vp = addr; - cbdata = *cbdatap; - cbdata.name = ""; - cbdata.addr = addr; - cbdata.addrend = addr + ssz; - cbdata.f_type = 0; - cbdata.indent++; - cbdata.type_width = 0; - cbdata.name_width = 0; - - if (cnt > 0) - dt_printf(cbdatap->dtp, cbdatap->fp, "%*s", cbdatap->indent * 4,""); - - switch (kind) { - case CTF_K_INTEGER: - if (ctf_type_encoding(cbdatap->dtt.dtt_ctfp, type, &cte) != 0) - return (-1); - if ((cte.cte_format & CTF_INT_SIGNED) != 0) - switch (cte.cte_bits) { - case 8: - if (isprint(*((unsigned char *) vp))) - dt_printf(cbdatap->dtp, cbdatap->fp, "'%c', ", *((char *) vp)); - dt_printf(cbdatap->dtp, cbdatap->fp, "%d (0x%x);\n", *((char *) vp), *((char *) vp)); - break; - case 16: - dt_printf(cbdatap->dtp, cbdatap->fp, "%hd (0x%hx);\n", *((short *) vp), *((u_short *) vp)); - break; - case 32: - dt_printf(cbdatap->dtp, cbdatap->fp, "%d (0x%x);\n", *((int *) vp), *((u_int *) vp)); - break; - case 64: - dt_printf(cbdatap->dtp, cbdatap->fp, "%jd (0x%jx);\n", *((long long *) vp), *((unsigned long long *) vp)); - break; - default: - dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_INTEGER: format %x offset %u bits %u\n",cte.cte_format,cte.cte_offset,cte.cte_bits); - break; - } - else - switch (cte.cte_bits) { - case 8: - dt_printf(cbdatap->dtp, cbdatap->fp, "%u (0x%x);\n", *((uint8_t *) vp) & 0xff, *((uint8_t *) vp) & 0xff); - break; - case 16: - 
dt_printf(cbdatap->dtp, cbdatap->fp, "%hu (0x%hx);\n", *((u_short *) vp), *((u_short *) vp)); - break; - case 32: - dt_printf(cbdatap->dtp, cbdatap->fp, "%u (0x%x);\n", *((u_int *) vp), *((u_int *) vp)); - break; - case 64: - dt_printf(cbdatap->dtp, cbdatap->fp, "%ju (0x%jx);\n", *((unsigned long long *) vp), *((unsigned long long *) vp)); - break; - default: - dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_INTEGER: format %x offset %u bits %u\n",cte.cte_format,cte.cte_offset,cte.cte_bits); - break; - } - break; - case CTF_K_FLOAT: - dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_FLOAT: format %x offset %u bits %u\n",cte.cte_format,cte.cte_offset,cte.cte_bits); - break; - case CTF_K_POINTER: - dt_printf(cbdatap->dtp, cbdatap->fp, "%p;\n", *((void **) addr)); - break; - case CTF_K_ARRAY: - if (ctf_array_info(cbdatap->dtt.dtt_ctfp, type, &arinfo) != 0) - return (-1); - dt_printf(cbdatap->dtp, cbdatap->fp, "{\n%*s",cbdata.indent * 4,""); - dt_print_type_data(&cbdata, arinfo.ctr_contents); - dt_printf(cbdatap->dtp, cbdatap->fp, "%*s};\n",cbdatap->indent * 4,""); - break; - case CTF_K_FUNCTION: - dt_printf(cbdatap->dtp, cbdatap->fp, "CTF_K_FUNCTION:\n"); - break; - case CTF_K_STRUCT: - cbdata.f_type = 1; - if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type, - dt_print_type_width, &cbdata) != 0) - return (-1); - dt_printf(cbdatap->dtp, cbdatap->fp, "{\n"); - if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type, - dt_print_type_member, &cbdata) != 0) - return (-1); - dt_printf(cbdatap->dtp, cbdatap->fp, "%*s};\n",cbdatap->indent * 4,""); - break; - case CTF_K_UNION: - cbdata.f_type = 1; - if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type, - dt_print_type_width, &cbdata) != 0) - return (-1); - dt_printf(cbdatap->dtp, cbdatap->fp, "{\n"); - if (ctf_member_iter(cbdatap->dtt.dtt_ctfp, type, - dt_print_type_member, &cbdata) != 0) - return (-1); - dt_printf(cbdatap->dtp, cbdatap->fp, "%*s};\n",cbdatap->indent * 4,""); - break; - case CTF_K_ENUM: - dt_printf(cbdatap->dtp, cbdatap->fp, "%s;\n", ctf_enum_name(cbdatap->dtt.dtt_ctfp, type, *((int *) vp))); - break; - case CTF_K_TYPEDEF: - dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type)); - break; - case CTF_K_VOLATILE: - if (cbdatap->f_type) - dt_printf(cbdatap->dtp, cbdatap->fp, "volatile "); - dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type)); - break; - case CTF_K_CONST: - if (cbdatap->f_type) - dt_printf(cbdatap->dtp, cbdatap->fp, "const "); - dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type)); - break; - case CTF_K_RESTRICT: - if (cbdatap->f_type) - dt_printf(cbdatap->dtp, cbdatap->fp, "restrict "); - dt_print_type_data(&cbdata, ctf_type_reference(cbdatap->dtt.dtt_ctfp,type)); - break; - default: - break; - } - - addr += ssz; - cnt++; - } - - return (0); -} - -static int -dt_print_type(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr) -{ - char *p; - dtrace_typeinfo_t dtt; - dt_type_cbdata_t cbdata; - int num = 0; - int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET); - ssize_t ssz; - - if (!quiet) - dt_printf(dtp, fp, "\n"); - - /* Get the total number of bytes of data buffered. */ - size_t nbytes = *((uintptr_t *) addr); - addr += sizeof(uintptr_t); - - /* - * Get the size of the type so that we can check that it matches - * the CTF data we look up and so that we can figure out how many - * type elements are buffered. - */ - size_t typs = *((uintptr_t *) addr); - addr += sizeof(uintptr_t); - - /* - * Point to the type string in the buffer. 
Get it's string - * length and round it up to become the offset to the start - * of the buffered type data which we would like to be aligned - * for easy access. - */ - char *strp = (char *) addr; - int offset = roundup(strlen(strp) + 1, sizeof(uintptr_t)); - - /* - * The type string might have a format such as 'int [20]'. - * Check if there is an array dimension present. - */ - if ((p = strchr(strp, '[')) != NULL) { - /* Strip off the array dimension. */ - *p++ = '\0'; - - for (; *p != '\0' && *p != ']'; p++) - num = num * 10 + *p - '0'; - } else - /* No array dimension, so default. */ - num = 1; - - /* Lookup the CTF type from the type string. */ - if (dtrace_lookup_by_type(dtp, DTRACE_OBJ_EVERY, strp, &dtt) < 0) - return (-1); - - /* Offset the buffer address to the start of the data... */ - addr += offset; - - ssz = ctf_type_size(dtt.dtt_ctfp, dtt.dtt_type); - - if (typs != ssz) { - printf("Expected type size from buffer (%lu) to match type size looked up now (%ld)\n", (u_long) typs, (long) ssz); - return (-1); - } - - cbdata.dtp = dtp; - cbdata.dtt = dtt; - cbdata.name = ""; - cbdata.addr = addr; - cbdata.addrend = addr + nbytes; - cbdata.indent = 1; - cbdata.f_type = 1; - cbdata.type_width = 0; - cbdata.name_width = 0; - cbdata.fp = fp; - - return (dt_print_type_data(&cbdata, dtt.dtt_type)); -} - -static int dt_print_sym(dtrace_hdl_t *dtp, FILE *fp, const char *format, caddr_t addr) { /* LINTED - alignment */ @@ -1868,10 +1562,10 @@ dt_print_sym(dtrace_hdl_t *dtp, FILE *fp */ if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) { (void) snprintf(c, sizeof (c), "%s`0x%llx", - dts.dts_object, (unsigned long long)pc); + dts.dts_object, (u_longlong_t)pc); } else { (void) snprintf(c, sizeof (c), "0x%llx", - (unsigned long long)pc); + (u_longlong_t)pc); } } @@ -1895,7 +1589,7 @@ dt_print_mod(dtrace_hdl_t *dtp, FILE *fp if (dtrace_lookup_by_addr(dtp, pc, NULL, &dts) == 0) { (void) snprintf(c, sizeof (c), "%s", dts.dts_object); } else { - (void) snprintf(c, sizeof (c), "0x%llx", (unsigned long long)pc); + (void) snprintf(c, sizeof (c), "0x%llx", (u_longlong_t)pc); } if (dt_printf(dtp, fp, format, c) < 0) @@ -1904,6 +1598,16 @@ dt_print_mod(dtrace_hdl_t *dtp, FILE *fp return (0); } +static int +dt_print_memory(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr) +{ + int quiet = (dtp->dt_options[DTRACEOPT_QUIET] != DTRACEOPT_UNSET); + size_t nbytes = *((uintptr_t *) addr); + + return (dt_print_bytes(dtp, fp, addr + sizeof(uintptr_t), + nbytes, 50, quiet, 1)); +} + typedef struct dt_normal { dtrace_aggvarid_t dtnd_id; uint64_t dtnd_normal; @@ -2644,12 +2348,6 @@ dt_consume_cpu(dtrace_hdl_t *dtp, FILE * goto nextrec; } - if (act == DTRACEACT_PRINTT) { - if (dt_print_type(dtp, fp, addr) < 0) - return (-1); - goto nextrec; - } - if (DTRACEACT_ISPRINTFLIKE(act)) { void *fmtdata; int (*func)(dtrace_hdl_t *, FILE *, void *, @@ -2674,8 +2372,6 @@ dt_consume_cpu(dtrace_hdl_t *dtp, FILE * case DTRACEACT_FREOPEN: func = dtrace_freopen; break; - default: - return (dt_set_errno(dtp, EDT_BADAGG)); } n = (*func)(dtp, fp, fmtdata, &data, @@ -3381,7 +3077,7 @@ dtrace_consume(dtrace_hdl_t *dtp, FILE * * Reduce memory usage by re-allocating smaller buffers * for the "remnants". 
*/ - while ((buf = dt_pq_walk(dtp->dt_bufq, &cookie)) != NULL) + while (buf = dt_pq_walk(dtp->dt_bufq, &cookie)) dt_realloc_buf(dtp, buf, buf->dtbd_size); } Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c,v retrieving revision 1.4 diff -u -p -r1.4 dt_dis.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c 24 Sep 2015 14:25:29 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dis.c 5 May 2017 16:56:53 -0000 @@ -176,7 +176,7 @@ dt_dis_setx(const dtrace_difo_t *dp, con if (intptr < dp->dtdo_intlen) { (void) fprintf(fp, "\t\t! 0x%llx", - (unsigned long long)dp->dtdo_inttab[intptr]); + (u_longlong_t)dp->dtdo_inttab[intptr]); } } @@ -334,8 +334,8 @@ dt_dis_rtab(const char *rtag, const dtra for (; len != 0; len--, rp++) { (void) fprintf(fp, "%-4u %-8llu %-8llu %s\n", - rp->dofr_type, (unsigned long long)rp->dofr_offset, - (unsigned long long)rp->dofr_data, + rp->dofr_type, (u_longlong_t)rp->dofr_offset, + (u_longlong_t)rp->dofr_data, &dp->dtdo_strtab[rp->dofr_name]); } } Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c,v retrieving revision 1.4 diff -u -p -r1.4 dt_dof.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c 24 Sep 2015 14:25:29 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_dof.c 13 Apr 2017 01:18:39 -0000 @@ -462,18 +462,8 @@ dof_add_probe(dt_idhash_t *dhp, dt_ident dt_buf_write(dtp, &ddo->ddo_enoffs, pip->pi_enoffs, pip->pi_nenoffs * sizeof (uint32_t), sizeof (uint32_t)); - /* - * If pi_rname isn't set, the relocation will be against the - * function name. If it is, the relocation will be against - * pi_rname. This will be used if the function is scoped - * locally so an alternate symbol is added for the purpose - * of this relocation. 
- */ - if (pip->pi_rname == NULL) - dofr.dofr_name = dofpr.dofpr_func; - else - dofr.dofr_name = dof_add_string(ddo, pip->pi_rname); - dofr.dofr_type = DOF_RELO_SETX; + dofr.dofr_name = dof_add_string(ddo, pip->pi_rname); + dofr.dofr_type = DOF_RELO_DOFREL; dofr.dofr_offset = dt_buf_len(&ddo->ddo_probes); dofr.dofr_data = 0; Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h,v retrieving revision 1.3 diff -u -p -r1.3 dt_errtags.h --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h 24 Sep 2015 14:25:29 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_errtags.h 13 Apr 2017 01:19:06 -0000 @@ -265,8 +265,6 @@ typedef enum { D_NOREG, /* no available internal registers */ D_PRINTM_ADDR, /* printm() memref bad type */ D_PRINTM_SIZE, /* printm() size bad type */ - D_PRINTT_ADDR, /* printt() typeref bad type */ - D_PRINTT_SIZE /* printt() size bad type */ } dt_errtag_t; extern const char *dt_errtag(dt_errtag_t); Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y,v retrieving revision 1.3 diff -u -p -r1.3 dt_grammar.y --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y 24 Sep 2015 14:25:29 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_grammar.y 10 Oct 2016 11:14:31 -0000 @@ -23,8 +23,9 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ + /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
*/ @@ -155,6 +156,8 @@ %type probe_specifier_list %type probe_specifier %type statement_list +%type statement_list_impl +%type statement_or_block %type statement %type declaration %type init_declarator_list @@ -319,9 +322,11 @@ probe_definition: "or actions following probe description\n"); } $$ = dt_node_clause($1, NULL, NULL); + yybegin(YYS_CLAUSE); } | probe_specifiers '{' statement_list '}' { $$ = dt_node_clause($1, NULL, $3); + yybegin(YYS_CLAUSE); } | probe_specifiers DT_TOK_DIV expression DT_TOK_EPRED { dnerror($3, D_SYNTAX, "expected actions { } following " @@ -330,6 +335,7 @@ probe_definition: | probe_specifiers DT_TOK_DIV expression DT_TOK_EPRED '{' statement_list '}' { $$ = dt_node_clause($1, $3, $6); + yybegin(YYS_CLAUSE); } ; @@ -349,12 +355,30 @@ probe_specifier: | DT_TOK_INT { $$ = dt_node_pdesc_by_id($1); } ; -statement_list: statement { $$ = $1; } - | statement_list ';' statement { $$ = LINK($1, $3); } +statement_list_impl: /* empty */ { $$ = NULL; } + | statement_list_impl statement { $$ = LINK($1, $2); } + ; + +statement_list: + statement_list_impl { $$ = $1; } + | statement_list_impl expression { + $$ = LINK($1, dt_node_statement($2)); + } ; -statement: /* empty */ { $$ = NULL; } - | expression { $$ = dt_node_statement($1); } +statement_or_block: + statement + | '{' statement_list '}' { $$ = $2; } + +statement: ';' { $$ = NULL; } + | expression ';' { $$ = dt_node_statement($1); } + | DT_KEY_IF DT_TOK_LPAR expression DT_TOK_RPAR statement_or_block { + $$ = dt_node_if($3, $5, NULL); + } + | DT_KEY_IF DT_TOK_LPAR expression DT_TOK_RPAR + statement_or_block DT_KEY_ELSE statement_or_block { + $$ = dt_node_if($3, $5, $7); + } ; argument_expression_list: Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c,v retrieving revision 1.5 diff -u -p -r1.5 dt_handle.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c 24 Sep 2015 14:25:29 -0000 1.5 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_handle.c 29 Apr 2017 03:37:32 -0000 @@ -208,7 +208,7 @@ dt_handle_err(dtrace_hdl_t *dtp, dtrace_ case DTRACEFLT_BADALIGN: case DTRACEFLT_BADSTACK: (void) sprintf(details, " (0x%llx)", - (unsigned long long)err.dteda_addr); + (u_longlong_t)err.dteda_addr); break; default: @@ -335,7 +335,7 @@ dt_handle_cpudrop(dtrace_hdl_t *dtp, pro } (void) snprintf(s, size, "%llu %sdrop%s on CPU %d\n", - (unsigned long long)howmany, + (u_longlong_t)howmany, what == DTRACEDROP_PRINCIPAL ? "" : "aggregation ", howmany > 1 ? "s" : "", (int)cpu); @@ -429,7 +429,7 @@ dt_handle_status(dtrace_hdl_t *dtp, dtra } (void) snprintf(s, size, "%llu %s%s%s\n", - (unsigned long long)(nval - oval), + (u_longlong_t)(nval - oval), _dt_droptab[i].dtdrt_str, (nval - oval > 1) ? "s" : "", _dt_droptab[i].dtdrt_msg != NULL ? 
_dt_droptab[i].dtdrt_msg : ""); Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c,v retrieving revision 1.5 diff -u -p -r1.5 dt_ident.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c 4 Feb 2016 17:27:32 -0000 1.5 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_ident.c 5 May 2017 16:57:32 -0000 @@ -358,7 +358,7 @@ dt_idcook_args(dt_node_t *dnp, dt_ident_ if (ap->dn_value >= prp->pr_argc) { xyerror(D_ARGS_IDX, "index %lld is out of range for %s %s[ ]\n", - (long long)ap->dn_value, dtrace_desc2str(yypcb->pcb_pdesc, + (longlong_t)ap->dn_value, dtrace_desc2str(yypcb->pcb_pdesc, n1, sizeof (n1)), idp->di_name); } @@ -374,12 +374,12 @@ dt_idcook_args(dt_node_t *dnp, dt_ident_ if (xnp->dn_type == CTF_ERR) { xyerror(D_ARGS_TYPE, "failed to resolve translated type for " - "%s[%lld]\n", idp->di_name, (long long)ap->dn_value); + "%s[%lld]\n", idp->di_name, (longlong_t)ap->dn_value); } if (nnp->dn_type == CTF_ERR) { xyerror(D_ARGS_TYPE, "failed to resolve native type for " - "%s[%lld]\n", idp->di_name, (long long)ap->dn_value); + "%s[%lld]\n", idp->di_name, (longlong_t)ap->dn_value); } if (dtp->dt_xlatemode == DT_XL_STATIC && ( @@ -428,7 +428,7 @@ dt_idcook_args(dt_node_t *dnp, dt_ident_ } else { xyerror(D_ARGS_XLATOR, "translator for %s[%lld] from %s to %s " - "is not defined\n", idp->di_name, (long long)ap->dn_value, + "is not defined\n", idp->di_name, (longlong_t)ap->dn_value, dt_node_type_name(nnp, n1, sizeof (n1)), dt_node_type_name(xnp, n2, sizeof (n2))); } @@ -459,7 +459,7 @@ dt_idcook_regs(dt_node_t *dnp, dt_ident_ if ((ap->dn_flags & DT_NF_SIGNED) && (int64_t)ap->dn_value < 0) { xyerror(D_REGS_IDX, "index %lld is out of range for array %s\n", - (long long)ap->dn_value, idp->di_name); + (longlong_t)ap->dn_value, idp->di_name); } if (dt_type_lookup("uint64_t", &dtt) == -1) { Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h,v retrieving revision 1.7 diff -u -p -r1.7 dt_impl.h --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h 28 Apr 2016 00:02:40 -0000 1.7 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h 29 Apr 2017 03:37:50 -0000 @@ -26,7 +26,7 @@ /* * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ #ifndef _DT_IMPL_H @@ -362,6 +362,7 @@ struct dtrace_hdl { int dt_indent; /* recommended flow indent */ dtrace_epid_t dt_last_epid; /* most recently consumed EPID */ uint64_t dt_last_timestamp; /* most recently consumed timestamp */ + boolean_t dt_has_sugar; /* syntactic sugar used? */ }; /* @@ -487,7 +488,6 @@ struct dtrace_hdl { #define DT_ACT_SETOPT DT_ACT(28) /* setopt() action */ #define DT_ACT_PRINT DT_ACT(29) /* print() action */ #define DT_ACT_PRINTM DT_ACT(30) /* printm() action */ -#define DT_ACT_PRINTT DT_ACT(31) /* printt() action */ /* * Sentinel to tell freopen() to restore the saved stdout. 
This must not @@ -744,15 +744,19 @@ extern int _dtrace_argmax; /* default m extern const char *_dtrace_libdir; /* default library directory */ extern const char *_dtrace_moddir; /* default kernel module directory */ -#ifndef illumos -extern const char *dt_bootfile(char *, size_t); -#endif - #if defined(__FreeBSD__) || defined(__NetBSD__) extern int gmatch(const char *, const char *); extern int yylex(void); #endif +#ifdef __NetBSD__ +extern const char *dt_bootfile(char *, size_t); + +#define longlong_t long long +#define u_longlong_t unsigned long long +#define __DECONST(a, b) __UNCONST(b) +#endif + #ifdef __cplusplus } #endif Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c,v retrieving revision 1.9 diff -u -p -r1.9 dt_link.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c 12 Jan 2017 23:12:59 -0000 1.9 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_link.c 29 Apr 2017 03:14:25 -0000 @@ -247,7 +247,7 @@ printf("%s:%s(%d): DOODAD\n",__FUNCTION_ rel->r_offset = s->dofs_offset + dofr[j].dofr_offset; rel->r_info = ELF32_R_INFO(count + dep->de_global, - R_386_32); + R_386_PC32); #elif defined(__mips__) /* XXX */ printf("%s:%s(%d): DOODAD\n",__FUNCTION__,__FILE__,__LINE__); @@ -269,6 +269,9 @@ printf("%s:%s(%d): DOODAD\n",__FUNCTION_ dofr[j].dofr_offset + 4; rel->r_info = ELF32_R_INFO(count + dep->de_global, R_SPARC_32); +#elif defined(__riscv__) +/* XXX */ +printf("%s:%s(%d): DOODAD\n",__FUNCTION__,__FILE__,__LINE__); #else #error unknown ISA #endif @@ -294,11 +297,7 @@ printf("%s:%s(%d): DOODAD\n",__FUNCTION_ sym->st_value = 0; sym->st_size = dof->dofh_filesz; sym->st_info = ELF32_ST_INFO(STB_GLOBAL, STT_OBJECT); -#ifdef illumos - sym->st_other = 0; -#else sym->st_other = ELF32_ST_VISIBILITY(STV_HIDDEN); -#endif sym->st_shndx = ESHDR_DOF; sym++; @@ -450,23 +449,16 @@ prepare_elf64(dtrace_hdl_t *dtp, const d dofr[j].dofr_offset; rel->r_info = ELF64_R_INFO(count + dep->de_global, R_PPC64_REL64); +#elif defined(__riscv__) +/* XXX */ #elif defined(__i386) || defined(__amd64) - rel->r_offset = s->dofs_offset + - dofr[j].dofr_offset; -#ifdef illumos - rel->r_info = ELF64_R_INFO(count + dep->de_global, - R_AMD64_64); -#else -#if defined(__amd64) - rel->r_info = ELF64_R_INFO(count + dep->de_global, - R_X86_64_RELATIVE); -#endif +#ifndef R_X86_64_PC64 +#define R_X86_64_PC64 24 #endif -#elif defined(__sparc) rel->r_offset = s->dofs_offset + dofr[j].dofr_offset; rel->r_info = ELF64_R_INFO(count + dep->de_global, - R_SPARC_64); + R_X86_64_PC64); #else #error unknown ISA #endif @@ -492,11 +484,7 @@ prepare_elf64(dtrace_hdl_t *dtp, const d sym->st_value = 0; sym->st_size = dof->dofh_filesz; sym->st_info = GELF_ST_INFO(STB_GLOBAL, STT_OBJECT); -#ifdef illumos - sym->st_other = 0; -#else sym->st_other = ELF64_ST_VISIBILITY(STV_HIDDEN); -#endif sym->st_shndx = ESHDR_DOF; sym++; @@ -806,16 +794,15 @@ dump_elf64(dtrace_hdl_t *dtp, const dof_ } static int -dt_symtab_lookup(Elf_Data *data_sym, int nsym, uintptr_t addr, uint_t shn, - GElf_Sym *sym, int uses_funcdesc, Elf *elf) +dt_symtab_lookup(Elf_Data *data_sym, int start, int end, uintptr_t addr, + uint_t shn, GElf_Sym *sym, int uses_funcdesc, Elf *elf) { - int i, ret = -1; Elf64_Addr symval; Elf_Scn *opd_scn; Elf_Data *opd_desc; - GElf_Sym s; + int i; - for (i = 0; i < nsym && gelf_getsym(data_sym, i, sym) != NULL; i++) { + for (i = start; i < end && 
gelf_getsym(data_sym, i, sym) != NULL; i++) { if (GELF_ST_TYPE(sym->st_info) == STT_FUNC) { symval = sym->st_value; if (uses_funcdesc) { @@ -825,20 +812,12 @@ dt_symtab_lookup(Elf_Data *data_sym, int *(uint64_t*)((char *)opd_desc->d_buf + symval); } if ((uses_funcdesc || shn == sym->st_shndx) && - symval <= addr && - addr < symval + sym->st_size) { - if (GELF_ST_BIND(sym->st_info) == STB_GLOBAL) - return (0); - - ret = 0; - s = *sym; - } + symval <= addr && addr < symval + sym->st_size) + return (0); } } - if (ret == 0) - *sym = s; - return (ret); + return (-1); } #if defined(__aarch64__) @@ -949,7 +928,15 @@ dt_modtext(dtrace_hdl_t *dtp, char *p, i return (0); } - +#elif defined(__riscv__) +/* XXX */ +static int +dt_modtext(dtrace_hdl_t *dtp, char *p, int isenabled, GElf_Rela *rela, + uint32_t *off) +{ +printf("%s:%s(%d): DOODAD\n",__FUNCTION__,__FILE__,__LINE__); + return (0); +} #elif defined(__sparc) #define DT_OP_RET 0x81c7e008 @@ -1224,6 +1211,8 @@ process_obj(dtrace_hdl_t *dtp, const cha static const char dt_enabled[] = "enabled"; static const char dt_symprefix[] = "$dtrace"; static const char dt_symfmt[] = "%s%ld.%s"; + static const char dt_weaksymfmt[] = "%s.%s"; + char probename[DTRACE_NAMELEN]; int fd, i, ndx, eprobe, mod = 0; Elf *elf = NULL; GElf_Ehdr ehdr; @@ -1237,10 +1226,11 @@ process_obj(dtrace_hdl_t *dtp, const cha dt_provider_t *pvp; dt_probe_t *prp; uint32_t off, eclass, emachine1, emachine2; - size_t symsize, nsym = 0, isym, istr, len; + size_t symsize, osym, nsym, isym, istr, len; key_t objkey; dt_link_pair_t *pair, *bufs = NULL; dt_strtab_t *strtab; + void *tmp; if ((fd = open64(obj, O_RDWR)) == -1) { return (dt_link_error(dtp, elf, fd, bufs, @@ -1374,12 +1364,13 @@ process_obj(dtrace_hdl_t *dtp, const cha * target (text) section to replace the call instruction with * one or more nops. * - * If the function containing the probe is locally scoped - * (static), we create an alias used by the relocation in the - * generated object. The alias, a new symbol, will be global - * (so that the relocation from the generated object can be - * resolved), and hidden (so that it is converted to a local - * symbol at link time). Such aliases have this form: + * To avoid runtime overhead, the relocations added to the + * generated object should be resolved at static link time. We + * therefore create aliases for the functions that contain + * probes. An alias is global (so that the relocation from the + * generated object can be resolved), and hidden (so that its + * address is known at static link time). Such aliases have this + * form: * * $dtrace. * @@ -1417,16 +1408,13 @@ process_obj(dtrace_hdl_t *dtp, const cha if (strncmp(s, dt_prefix, sizeof (dt_prefix) - 1) != 0) continue; - if (dt_symtab_lookup(data_sym, isym, rela.r_offset, - shdr_rel.sh_info, &fsym, - (emachine1 == EM_PPC64), elf) != 0) { + if (dt_symtab_lookup(data_sym, 0, isym, rela.r_offset, + shdr_rel.sh_info, &fsym, (emachine1 == EM_PPC64), + elf) != 0) { dt_strtab_destroy(strtab); goto err; } - if (GELF_ST_BIND(fsym.st_info) != STB_LOCAL) - continue; - if (fsym.st_name > data_str->d_size) { dt_strtab_destroy(strtab); goto err; @@ -1462,12 +1450,12 @@ process_obj(dtrace_hdl_t *dtp, const cha } /* - * If needed, allocate the additional space for the symbol - * table and string table copying the old data into the new - * buffers, and marking the buffers as dirty. 
We inject those - * newly allocated buffers into the libelf data structures, but - * are still responsible for freeing them once we're done with - * the elf handle. + * If any probes were found, allocate the additional space for + * the symbol table and string table, copying the old data into + * the new buffers, and marking the buffers as dirty. We inject + * those newly allocated buffers into the libelf data + * structures, but are still responsible for freeing them once + * we're done with the elf handle. */ if (nsym > 0) { /* @@ -1501,7 +1489,9 @@ process_obj(dtrace_hdl_t *dtp, const cha bufs = pair; bcopy(data_str->d_buf, pair->dlp_str, data_str->d_size); + tmp = data_str->d_buf; data_str->d_buf = pair->dlp_str; + pair->dlp_str = tmp; data_str->d_size += len; (void) elf_flagdata(data_str, ELF_C_SET, ELF_F_DIRTY); @@ -1509,16 +1499,20 @@ process_obj(dtrace_hdl_t *dtp, const cha (void) gelf_update_shdr(scn_str, &shdr_str); bcopy(data_sym->d_buf, pair->dlp_sym, data_sym->d_size); + tmp = data_sym->d_buf; data_sym->d_buf = pair->dlp_sym; + pair->dlp_sym = tmp; data_sym->d_size += nsym * symsize; (void) elf_flagdata(data_sym, ELF_C_SET, ELF_F_DIRTY); shdr_sym.sh_size += nsym * symsize; (void) gelf_update_shdr(scn_sym, &shdr_sym); + osym = isym; nsym += isym; } else { dt_strtab_destroy(strtab); + continue; } /* @@ -1577,62 +1571,62 @@ process_obj(dtrace_hdl_t *dtp, const cha bcopy(s, pname, p - s); pname[p - s] = '\0'; - p = strhyphenate(p + 3); /* strlen("___") */ - - if (dt_symtab_lookup(data_sym, isym, rela.r_offset, - shdr_rel.sh_info, &fsym, - (emachine1 == EM_PPC64), elf) != 0) - goto err; - - if (fsym.st_name > data_str->d_size) - goto err; - - assert(GELF_ST_TYPE(fsym.st_info) == STT_FUNC); - - /* - * If a NULL relocation name is passed to - * dt_probe_define(), the function name is used for the - * relocation. The relocation needs to use a mangled - * name if the symbol is locally scoped; the function - * name may need to change if we've found the global - * alias for the locally scoped symbol (we prefer - * global symbols to locals in dt_symtab_lookup()). - */ - s = (char *)data_str->d_buf + fsym.st_name; - r = NULL; + if (dt_symtab_lookup(data_sym, osym, isym, + rela.r_offset, shdr_rel.sh_info, &fsym, + (emachine1 == EM_PPC64), elf) == 0) { + if (fsym.st_name > data_str->d_size) + goto err; - if (GELF_ST_BIND(fsym.st_info) == STB_LOCAL) { + r = s = (char *) data_str->d_buf + fsym.st_name; + assert(strstr(s, dt_symprefix) == s); + s = strchr(s, '.') + 1; + } else if (dt_symtab_lookup(data_sym, 0, osym, + rela.r_offset, shdr_rel.sh_info, &fsym, + (emachine1 == EM_PPC64), elf) == 0) { + u_int bind; + + bind = GELF_ST_BIND(fsym.st_info) == STB_WEAK ? + STB_WEAK : STB_GLOBAL; + + /* + * Emit an alias for the symbol. It needs to be + * non-preemptible so that .SUNW_dof relocations + * may be resolved at static link time. Aliases + * of weak symbols are given a non-unique name + * so that they may be merged by the linker. 
+ */ dsym = fsym; dsym.st_name = istr; - dsym.st_info = GELF_ST_INFO(STB_GLOBAL, - STT_FUNC); - dsym.st_other = - ELF64_ST_VISIBILITY(STV_ELIMINATE); + dsym.st_info = GELF_ST_INFO(bind, STT_FUNC); + dsym.st_other = GELF_ST_VISIBILITY(STV_HIDDEN); (void) gelf_update_sym(data_sym, isym, &dsym); - - r = (char *)data_str->d_buf + istr; - istr += 1 + sprintf(r, dt_symfmt, - dt_symprefix, objkey, s); + r = (char *) data_str->d_buf + istr; + s = (char *) data_str->d_buf + fsym.st_name; + if (bind == STB_WEAK) + istr += sprintf(r, dt_weaksymfmt, + dt_symprefix, s); + else + istr += sprintf(r, dt_symfmt, + dt_symprefix, objkey, s); + istr++; isym++; assert(isym <= nsym); - - } else if (strncmp(s, dt_symprefix, - strlen(dt_symprefix)) == 0) { - r = s; - if ((s = strchr(s, '.')) == NULL) - goto err; - s++; - } + } else + goto err; if ((pvp = dt_provider_lookup(dtp, pname)) == NULL) { return (dt_link_error(dtp, elf, fd, bufs, "no such provider %s", pname)); } - if ((prp = dt_probe_lookup(pvp, p)) == NULL) { + if (strlcpy(probename, p + 3, sizeof (probename)) >= + sizeof (probename)) return (dt_link_error(dtp, elf, fd, bufs, - "no such probe %s", p)); - } + "invalid probe name %s", probename)); + (void) strhyphenate(probename); + if ((prp = dt_probe_lookup(pvp, probename)) == NULL) + return (dt_link_error(dtp, elf, fd, bufs, + "no such probe %s", probename)); assert(fsym.st_value <= rela.r_offset); @@ -1695,9 +1689,6 @@ process_obj(dtrace_hdl_t *dtp, const cha (void) elf_end(elf); (void) close(fd); -#ifndef illumos - if (nsym > 0) -#endif while ((pair = bufs) != NULL) { bufs = pair->dlp_next; dt_free(dtp, pair->dlp_str); Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c,v retrieving revision 1.15 diff -u -p -r1.15 dt_module.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c 3 Aug 2016 16:37:02 -0000 1.15 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c 15 Jun 2017 23:20:19 -0000 @@ -24,6 +24,7 @@ */ /* * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2016, Pedro Giffuni. All rights reserved. 
*/ #include @@ -36,7 +37,7 @@ #include #else #include -#include +//#include #include #include #endif @@ -93,7 +94,7 @@ dt_module_syminit32(dt_module_t *dmp) uint_t i, n = dmp->dm_nsymelems; uint_t asrsv = 0; -#if defined(__FreeBSD__) || defined(__NetBSD__) +#if defined(__FreeBSD__) GElf_Ehdr ehdr; int is_elf_obj; @@ -115,13 +116,16 @@ dt_module_syminit32(dt_module_t *dmp) (ELF32_ST_BIND(sym->st_info) != STB_LOCAL || sym->st_size)) { asrsv++; /* reserve space in the address map */ -#if defined(__FreeBSD__) || defined(__NetBSD__) +#if defined(__FreeBSD__) sym->st_value += (Elf_Addr) dmp->dm_reloc_offset; if (is_elf_obj && sym->st_shndx != SHN_UNDEF && sym->st_shndx < ehdr.e_shnum) sym->st_value += dmp->dm_sec_offsets[sym->st_shndx]; #endif +#ifdef __NetBSD__ + sym->st_value += (Elf_Addr) dmp->dm_reloc_offset; +#endif } dt_module_symhash_insert(dmp, name, i); @@ -143,7 +147,7 @@ dt_module_syminit64(dt_module_t *dmp) uint_t i, n = dmp->dm_nsymelems; uint_t asrsv = 0; -#if defined(__FreeBSD__) || defined(__NetBSD__) +#if defined(__FreeBSD__) GElf_Ehdr ehdr; int is_elf_obj; @@ -164,13 +168,16 @@ dt_module_syminit64(dt_module_t *dmp) if (sym->st_value != 0 && (ELF64_ST_BIND(sym->st_info) != STB_LOCAL || sym->st_size)) { asrsv++; /* reserve space in the address map */ -#if defined(__FreeBSD__) || defined(__NetBSD__) +#if defined(__FreeBSD__) sym->st_value += (Elf_Addr) dmp->dm_reloc_offset; if (is_elf_obj && sym->st_shndx != SHN_UNDEF && sym->st_shndx < ehdr.e_shnum) sym->st_value += dmp->dm_sec_offsets[sym->st_shndx]; #endif +#ifdef __NetBSD__ + sym->st_value += (Elf_Addr) dmp->dm_reloc_offset; +#endif } dt_module_symhash_insert(dmp, name, i); @@ -725,22 +732,20 @@ dt_module_load_proc(dtrace_hdl_t *dtp, d return (dt_set_errno(dtp, EDT_CANTLOAD)); } - dmp->dm_libctfp = malloc(sizeof (ctf_file_t *) * arg.dpa_count); + dmp->dm_libctfp = calloc(arg.dpa_count, sizeof (ctf_file_t *)); if (dmp->dm_libctfp == NULL) { dt_proc_unlock(dtp, p); dt_proc_release(dtp, p); return (dt_set_errno(dtp, EDT_NOMEM)); } - bzero(dmp->dm_libctfp, sizeof (ctf_file_t *) * arg.dpa_count); - dmp->dm_libctfn = malloc(sizeof (char *) * arg.dpa_count); + dmp->dm_libctfn = calloc(arg.dpa_count, sizeof (char *)); if (dmp->dm_libctfn == NULL) { free(dmp->dm_libctfp); dt_proc_unlock(dtp, p); dt_proc_release(dtp, p); return (dt_set_errno(dtp, EDT_NOMEM)); } - bzero(dmp->dm_libctfn, sizeof (char *) * arg.dpa_count); dmp->dm_nctflibs = arg.dpa_count; @@ -821,17 +826,14 @@ dt_module_load(dtrace_hdl_t *dtp, dt_mod dmp->dm_nsymbuckets = _dtrace_strbuckets; dmp->dm_symfree = 1; /* first free element is index 1 */ - dmp->dm_symbuckets = malloc(sizeof (uint_t) * dmp->dm_nsymbuckets); - dmp->dm_symchains = malloc(sizeof (dt_sym_t) * dmp->dm_nsymelems + 1); + dmp->dm_symbuckets = calloc(dmp->dm_nsymbuckets, sizeof (uint_t)); + dmp->dm_symchains = calloc(dmp->dm_nsymelems + 1, sizeof (dt_sym_t)); if (dmp->dm_symbuckets == NULL || dmp->dm_symchains == NULL) { dt_module_unload(dtp, dmp); return (dt_set_errno(dtp, EDT_NOMEM)); } - bzero(dmp->dm_symbuckets, sizeof (uint_t) * dmp->dm_nsymbuckets); - bzero(dmp->dm_symchains, sizeof (dt_sym_t) * dmp->dm_nsymelems + 1); - /* * Iterate over the symbol table data buffer and insert each symbol * name into the name hash if the name and type are valid. 
Then @@ -983,7 +985,7 @@ dt_module_unload(dtrace_hdl_t *dtp, dt_m free(dmp->dm_asmap); dmp->dm_asmap = NULL; } -#if defined(__FreeBSD__) || defined(__NetBSD__) +#if defined(__FreeBSD__) if (dmp->dm_sec_offsets != NULL) { free(dmp->dm_sec_offsets); dmp->dm_sec_offsets = NULL; @@ -1145,7 +1147,7 @@ dt_module_update(dtrace_hdl_t *dtp, stru char fname[MAXPATHLEN]; struct stat64 st; int fd, err, bits; -#if defined(__FreeBSD__) +#ifdef __FreeBSD__ struct module_stat ms; dt_kmodule_t *dkmp; uint_t h; @@ -1178,11 +1180,15 @@ dt_module_update(dtrace_hdl_t *dtp, stru char osrel[64]; char machine[64]; size_t len; + uintptr_t mapbase; + int i; + bool ismod; if (strcmp("netbsd", name) == 0) { /* want the kernel, but it is not absolute */ dt_bootfile(machine, sizeof(machine)); snprintf(fname, sizeof(fname), "/%s", machine); + ismod = false; } else { /* build stand module path from system */ @@ -1202,6 +1208,7 @@ dt_module_update(dtrace_hdl_t *dtp, stru (void) snprintf(fname, sizeof (fname), "/stand/%s/%s/modules/%s/%s.kmod", machine, osrel, name, name); + ismod = true; } #endif @@ -1258,6 +1265,34 @@ dt_module_update(dtrace_hdl_t *dtp, stru } } #endif +#ifdef __NetBSD__ + mapbase = 0; + if (ismod) { + int maxmodules = 512; + modstat_t modstat_buf[maxmodules], *ms; + struct iovec iov = { modstat_buf, sizeof(modstat_buf) }; + + if (modctl(MODCTL_STAT, &iov) < 0) { + dt_dprintf("failed to get list of kernel modules: %s\n", + strerror(errno)); + return; + } + + for (i = 0; i < maxmodules; i++) { + ms = &modstat_buf[i]; + if (!strcmp(name, ms->ms_name)) { + mapbase = ms->ms_addr; + break; + } + } + if (i == maxmodules) { + dt_dprintf("module %s not found\n", name); + return; + } + dmp->dm_reloc_offset = (void *)mapbase; + } +#endif + /* * Iterate over the section headers locating various sections of * interest and use their attributes to flesh out the dt_module_t. @@ -1302,7 +1337,8 @@ dt_module_update(dtrace_hdl_t *dtp, stru dmp->dm_flags |= DT_DM_KERNEL; #ifdef illumos dmp->dm_modid = (int)OBJFS_MODID(st.st_ino); -#else +#endif /* illumos */ +#ifdef __FreeBSD__ /* * Include .rodata and special sections into .text. * This depends on default section layout produced by GNU ld @@ -1310,7 +1346,7 @@ dt_module_update(dtrace_hdl_t *dtp, stru * [Text][R/O data][R/W data][Dynamic][BSS][Non loadable] */ dmp->dm_text_size = dmp->dm_data_va - dmp->dm_text_va; -#if defined(__i386__) && !defined(__NetBSD__) +#if defined(__i386__) /* * Find the first load section and figure out the relocation * offset for the symbols. 
The kernel module will not need @@ -1323,12 +1359,21 @@ dt_module_update(dtrace_hdl_t *dtp, stru } } #endif -#endif /* illumos */ +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ + if (ismod) { + dmp->dm_text_va = mapbase; + dmp->dm_data_va = 0; + dmp->dm_data_size = 0; + dmp->dm_bss_va = 0; + dmp->dm_bss_size = 0; + } +#endif if (dmp->dm_info.objfs_info_primary) dmp->dm_flags |= DT_DM_PRIMARY; -#if defined(__FreeBSD__) +#ifdef __FreeBSD__ ms.version = sizeof(ms); for (modid = kldfirstmod(k_stat->id); modid > 0; modid = modnext(modid)) { @@ -1405,8 +1450,31 @@ dtrace_update(dtrace_hdl_t *dtp) dt_module_update(dtp, &k_stat); } #elif defined(__NetBSD__) - /* XXX just the kernel for now */ + size_t len; + struct iovec iov; + modstat_t *ms; + dt_module_update(dtp, "netbsd"); + for (len = 8192;;) { + iov.iov_base = malloc(len); + iov.iov_len = len; + if (modctl(MODCTL_STAT, &iov)) { + free(iov.iov_base); + iov.iov_len = 0; + break; + } + if (len >= iov.iov_len) { + break; + } + free(iov.iov_base); + len = iov.iov_len; + } + len = iov.iov_len / sizeof(modstat_t); + for (ms = iov.iov_base; len != 0; ms++, len--) { + if (ms->ms_source != MODULE_SOURCE_FILESYS) + continue; + dt_module_update(dtp, ms->ms_name); + } #endif /* @@ -1583,6 +1651,7 @@ dtrace_lookup_by_addr(dtrace_hdl_t *dtp, for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL; dmp = dt_list_next(dmp)) { + if (addr - dmp->dm_text_va < dmp->dm_text_size || addr - dmp->dm_data_va < dmp->dm_data_size || addr - dmp->dm_bss_va < dmp->dm_bss_size) Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c,v retrieving revision 1.12 diff -u -p -r1.12 dt_open.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c 28 Apr 2016 11:38:41 -0000 1.12 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_open.c 5 May 2017 16:59:27 -0000 @@ -22,7 +22,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
*/ #include @@ -130,8 +130,9 @@ #define DT_VERS_1_11 DT_VERSION_NUMBER(1, 11, 0) #define DT_VERS_1_12 DT_VERSION_NUMBER(1, 12, 0) #define DT_VERS_1_12_1 DT_VERSION_NUMBER(1, 12, 1) -#define DT_VERS_LATEST DT_VERS_1_12_1 -#define DT_VERS_STRING "Sun D 1.12.1" +#define DT_VERS_1_13 DT_VERSION_NUMBER(1, 13, 0) +#define DT_VERS_LATEST DT_VERS_1_13 +#define DT_VERS_STRING "Sun D 1.13" const dt_version_t _dtrace_versions[] = { DT_VERS_1_0, /* D API 1.0.0 (PSARC 2001/466) Solaris 10 FCS */ @@ -157,6 +158,7 @@ const dt_version_t _dtrace_versions[] = DT_VERS_1_11, /* D API 1.11 */ DT_VERS_1_12, /* D API 1.12 */ DT_VERS_1_12_1, /* D API 1.12.1 */ + DT_VERS_1_13, /* D API 1.13 */ 0 }; @@ -390,8 +392,6 @@ static const dt_ident_t _dtrace_globals[ &dt_idops_func, "void(@, ...)" }, { "printm", DT_IDENT_ACTFUNC, 0, DT_ACT_PRINTM, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_func, "void(size_t, uintptr_t *)" }, -{ "printt", DT_IDENT_ACTFUNC, 0, DT_ACT_PRINTT, DT_ATTR_STABCMN, DT_VERS_1_0, - &dt_idops_func, "void(size_t, uintptr_t *)" }, { "probefunc", DT_IDENT_SCALAR, 0, DIF_VAR_PROBEFUNC, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_type, "string" }, { "probemod", DT_IDENT_SCALAR, 0, DIF_VAR_PROBEMOD, @@ -503,8 +503,6 @@ static const dt_ident_t _dtrace_globals[ &dt_idops_func, "void(@, size_t, ...)" }, { "trunc", DT_IDENT_ACTFUNC, 0, DT_ACT_TRUNC, DT_ATTR_STABCMN, DT_VERS_1_0, &dt_idops_func, "void(...)" }, -{ "typeref", DT_IDENT_FUNC, 0, DIF_SUBR_TYPEREF, DT_ATTR_STABCMN, DT_VERS_1_1, - &dt_idops_func, "uintptr_t *(void *, size_t, string, size_t)" }, { "uaddr", DT_IDENT_ACTFUNC, 0, DT_ACT_UADDR, DT_ATTR_STABCMN, DT_VERS_1_2, &dt_idops_func, "_usymaddr(uintptr_t)" }, { "ucaller", DT_IDENT_SCALAR, 0, DIF_VAR_UCALLER, DT_ATTR_STABCMN, @@ -941,9 +939,11 @@ dt_provmod_open(dt_provmod_t **provmod, * reallocate it. We normally won't need to do this * because providers aren't being loaded all the time. */ - if ((p = realloc(p_providers,len)) == NULL) + if ((p = realloc(p_providers,len)) == NULL) { + free(p_providers); /* How do we report errors here? */ return; + } p_providers = p; } else break; @@ -1190,8 +1190,10 @@ dt_vopen(int version, int flags, int *er (void) fcntl(ftfd, F_SETFD, FD_CLOEXEC); alloc: - if ((dtp = malloc(sizeof (dtrace_hdl_t))) == NULL) + if ((dtp = malloc(sizeof (dtrace_hdl_t))) == NULL) { + dt_provmod_destroy(&provmod); return (set_open_errno(dtp, errp, EDT_NOMEM)); + } bzero(dtp, sizeof (dtrace_hdl_t)); dtp->dt_oflags = flags; @@ -1339,8 +1341,8 @@ alloc: /* * On FreeBSD the kernel module name can't be hard-coded. The - * 'kern.bootfile' sysctl value tells us exactly which file is - * being used as the kernel. + * 'kern.bootfile' sysctl value tells us exactly which file is being + * used as the kernel. 
*/ #ifndef illumos # ifdef __FreeBSD__ Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c,v retrieving revision 1.7 diff -u -p -r1.7 dt_options.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c 24 Sep 2015 14:25:29 -0000 1.7 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_options.c 29 Apr 2017 03:20:38 -0000 @@ -751,7 +751,7 @@ dt_opt_rate(dtrace_hdl_t *dtp, const cha } } - if ((suffix[i].name == NULL && *end != '\0') || val < 0) + if (suffix[i].name == NULL && *end != '\0' || val < 0) return (dt_set_errno(dtp, EDT_BADOPTVAL)); if (mul == 0) { Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c,v retrieving revision 1.8 diff -u -p -r1.8 dt_parser.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c 4 Feb 2016 17:27:32 -0000 1.8 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.c 5 May 2017 17:00:23 -0000 @@ -23,7 +23,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2013, Joyent Inc. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #pragma ident "%Z%%M% %I% %E% SMI" @@ -424,7 +424,7 @@ dt_node_name(const dt_node_t *dnp, char switch (dnp->dn_kind) { case DT_NODE_INT: (void) snprintf(buf, len, "integer constant 0x%llx", - (unsigned long long)dnp->dn_value); + (u_longlong_t)dnp->dn_value); break; case DT_NODE_STRING: s = strchr2esc(dnp->dn_string, strlen(dnp->dn_string)); @@ -1280,7 +1280,7 @@ dt_node_int(uintmax_t value) } xyerror(D_INT_OFLOW, "integer constant 0x%llx cannot be represented " - "in any built-in integral type\n", (unsigned long long)value); + "in any built-in integral type\n", (u_longlong_t)value); /*NOTREACHED*/ return (NULL); /* keep gcc happy */ } @@ -2143,6 +2143,17 @@ dt_node_statement(dt_node_t *expr) } dt_node_t * +dt_node_if(dt_node_t *pred, dt_node_t *acts, dt_node_t *else_acts) +{ + dt_node_t *dnp = dt_node_alloc(DT_NODE_IF); + dnp->dn_conditional = pred; + dnp->dn_body = acts; + dnp->dn_alternate_body = else_acts; + + return (dnp); +} + +dt_node_t * dt_node_pdesc_by_name(char *spec) { dtrace_hdl_t *dtp = yypcb->pcb_hdl; @@ -2184,20 +2195,19 @@ dt_node_pdesc_by_id(uintmax_t id) longjmp(yypcb->pcb_jmpbuf, EDT_NOMEM); if (id > UINT_MAX) { - xyerror(D_PDESC_INVAL, "identifier %"PRIuMAX" exceeds maximum " - "probe id\n", id); + xyerror(D_PDESC_INVAL, "identifier %llu exceeds maximum " + "probe id\n", (u_longlong_t)id); } if (yypcb->pcb_pspec != DTRACE_PROBESPEC_NAME) { - xyerror(D_PDESC_INVAL, "probe identifier %"PRIuMAX - " not permitted when specifying %s\n", id, + xyerror(D_PDESC_INVAL, "probe identifier %llu not permitted " + "when specifying %s\n", (u_longlong_t)id, names[yypcb->pcb_pspec]); } if (dtrace_id2desc(dtp, (dtrace_id_t)id, dnp->dn_desc) != 0) { - xyerror(D_PDESC_INVAL, "invalid probe identifier %"PRIuMAX - ": %s\n", - id, dtrace_errmsg(dtp, dtrace_errno(dtp))); + xyerror(D_PDESC_INVAL, "invalid probe identifier %llu: %s\n", + (u_longlong_t)id, dtrace_errmsg(dtp, dtrace_errno(dtp))); } return (dnp); @@ -2212,7 +2222,6 @@ dt_node_clause(dt_node_t *pdescs, dt_nod dnp->dn_pred = pred; dnp->dn_acts = acts; - 
yybegin(YYS_CLAUSE); return (dnp); } @@ -3204,8 +3213,9 @@ dt_cook_op2(dt_node_t *dnp, uint_t idfla dt_xcook_ident(lp, dhp, idkind, B_TRUE); else dt_xcook_ident(lp, dhp, idp->di_kind, B_FALSE); - } else + } else { lp = dnp->dn_left = dt_node_cook(lp, 0); + } /* * Switch op to '+' for *(E1 + E2) array mode in these cases: @@ -3219,10 +3229,12 @@ dt_cook_op2(dt_node_t *dnp, uint_t idfla if (lp->dn_ident->di_kind == DT_IDENT_ARRAY) { if (lp->dn_args != NULL) op = DT_TOK_ADD; - } else if (!dt_ident_unref(lp->dn_ident)) + } else if (!dt_ident_unref(lp->dn_ident)) { op = DT_TOK_ADD; - } else if (lp->dn_kind != DT_NODE_AGG) + } + } else if (lp->dn_kind != DT_NODE_AGG) { op = DT_TOK_ADD; + } } switch (op) { @@ -3646,45 +3658,34 @@ asgn_common: case DT_TOK_PTR: /* - * If the left-hand side of operator -> is the name "self", - * then we permit a TLS variable to be created or referenced. + * If the left-hand side of operator -> is one of the scoping + * keywords, permit a local or thread variable to be created or + * referenced. */ - if (lp->dn_kind == DT_NODE_IDENT && - strcmp(lp->dn_string, "self") == 0) { - if (rp->dn_kind != DT_NODE_VAR) { - dt_xcook_ident(rp, dtp->dt_tls, - DT_IDENT_SCALAR, B_TRUE); - } - - if (idflags != 0) - rp = dt_node_cook(rp, idflags); - - dnp->dn_right = dnp->dn_left; /* avoid freeing rp */ - dt_node_free(dnp); - return (rp); - } + if (lp->dn_kind == DT_NODE_IDENT) { + dt_idhash_t *dhp = NULL; - /* - * If the left-hand side of operator -> is the name "this", - * then we permit a local variable to be created or referenced. - */ - if (lp->dn_kind == DT_NODE_IDENT && - strcmp(lp->dn_string, "this") == 0) { - if (rp->dn_kind != DT_NODE_VAR) { - dt_xcook_ident(rp, yypcb->pcb_locals, - DT_IDENT_SCALAR, B_TRUE); + if (strcmp(lp->dn_string, "self") == 0) { + dhp = dtp->dt_tls; + } else if (strcmp(lp->dn_string, "this") == 0) { + dhp = yypcb->pcb_locals; } + if (dhp != NULL) { + if (rp->dn_kind != DT_NODE_VAR) { + dt_xcook_ident(rp, dhp, + DT_IDENT_SCALAR, B_TRUE); + } - if (idflags != 0) - rp = dt_node_cook(rp, idflags); + if (idflags != 0) + rp = dt_node_cook(rp, idflags); - dnp->dn_right = dnp->dn_left; /* avoid freeing rp */ - dt_node_free(dnp); - return (rp); + /* avoid freeing rp */ + dnp->dn_right = dnp->dn_left; + dt_node_free(dnp); + return (rp); + } } - /*FALLTHRU*/ - case DT_TOK_DOT: lp = dnp->dn_left = dt_node_cook(lp, DT_IDFLG_REF); @@ -4503,7 +4504,8 @@ static dt_node_t *(*dt_cook_funcs[])(dt_ dt_cook_xlator, /* DT_NODE_XLATOR */ dt_cook_none, /* DT_NODE_PROBE */ dt_cook_provider, /* DT_NODE_PROVIDER */ - dt_cook_none /* DT_NODE_PROG */ + dt_cook_none, /* DT_NODE_PROG */ + dt_cook_none, /* DT_NODE_IF */ }; /* @@ -4518,6 +4520,8 @@ dt_node_cook(dt_node_t *dnp, uint_t idfl yylineno = dnp->dn_line; + assert(dnp->dn_kind < + sizeof (dt_cook_funcs) / sizeof (dt_cook_funcs[0])); dnp = dt_cook_funcs[dnp->dn_kind](dnp, idflags); dnp->dn_flags |= DT_NF_COOKED; @@ -4620,6 +4624,181 @@ dt_node_diftype(dtrace_hdl_t *dtp, const tp->dtdt_size = ctf_type_size(dnp->dn_ctfp, dnp->dn_type); } +/* + * Output the parse tree as D. The "-xtree=8" argument will call this + * function to print out the program after any syntactic sugar + * transformations have been applied (e.g. to implement "if"). The + * resulting output can be used to understand the transformations + * applied by these features, or to run such a script on a system that + * does not support these features + * + * Note that the output does not express precisely the same program as + * the input. 
In particular: + * - Only the clauses are output. #pragma options, variable + * declarations, etc. are excluded. + * - Command argument substitution has already been done, so the output + * will not contain e.g. $$1, but rather the substituted string. + */ +void +dt_printd(dt_node_t *dnp, FILE *fp, int depth) +{ + dt_node_t *arg; + + switch (dnp->dn_kind) { + case DT_NODE_INT: + (void) fprintf(fp, "0x%llx", (u_longlong_t)dnp->dn_value); + if (!(dnp->dn_flags & DT_NF_SIGNED)) + (void) fprintf(fp, "u"); + break; + + case DT_NODE_STRING: { + char *escd = strchr2esc(dnp->dn_string, strlen(dnp->dn_string)); + (void) fprintf(fp, "\"%s\"", escd); + free(escd); + break; + } + + case DT_NODE_IDENT: + (void) fprintf(fp, "%s", dnp->dn_string); + break; + + case DT_NODE_VAR: + (void) fprintf(fp, "%s%s", + (dnp->dn_ident->di_flags & DT_IDFLG_LOCAL) ? "this->" : + (dnp->dn_ident->di_flags & DT_IDFLG_TLS) ? "self->" : "", + dnp->dn_ident->di_name); + + if (dnp->dn_args != NULL) { + (void) fprintf(fp, "["); + + for (arg = dnp->dn_args; arg != NULL; + arg = arg->dn_list) { + dt_printd(arg, fp, 0); + if (arg->dn_list != NULL) + (void) fprintf(fp, ", "); + } + + (void) fprintf(fp, "]"); + } + break; + + case DT_NODE_SYM: { + const dtrace_syminfo_t *dts = dnp->dn_ident->di_data; + (void) fprintf(fp, "%s`%s", dts->dts_object, dts->dts_name); + break; + } + case DT_NODE_FUNC: + (void) fprintf(fp, "%s(", dnp->dn_ident->di_name); + + for (arg = dnp->dn_args; arg != NULL; arg = arg->dn_list) { + dt_printd(arg, fp, 0); + if (arg->dn_list != NULL) + (void) fprintf(fp, ", "); + } + (void) fprintf(fp, ")"); + break; + + case DT_NODE_OP1: + (void) fprintf(fp, "%s(", opstr(dnp->dn_op)); + dt_printd(dnp->dn_child, fp, 0); + (void) fprintf(fp, ")"); + break; + + case DT_NODE_OP2: + (void) fprintf(fp, "("); + dt_printd(dnp->dn_left, fp, 0); + if (dnp->dn_op == DT_TOK_LPAR) { + (void) fprintf(fp, ")"); + dt_printd(dnp->dn_right, fp, 0); + break; + } + if (dnp->dn_op == DT_TOK_PTR || dnp->dn_op == DT_TOK_DOT || + dnp->dn_op == DT_TOK_LBRAC) + (void) fprintf(fp, "%s", opstr(dnp->dn_op)); + else + (void) fprintf(fp, " %s ", opstr(dnp->dn_op)); + dt_printd(dnp->dn_right, fp, 0); + if (dnp->dn_op == DT_TOK_LBRAC) { + dt_node_t *ln = dnp->dn_right; + while (ln->dn_list != NULL) { + (void) fprintf(fp, ", "); + dt_printd(ln->dn_list, fp, depth); + ln = ln->dn_list; + } + (void) fprintf(fp, "]"); + } + (void) fprintf(fp, ")"); + break; + + case DT_NODE_OP3: + (void) fprintf(fp, "("); + dt_printd(dnp->dn_expr, fp, 0); + (void) fprintf(fp, " ? 
"); + dt_printd(dnp->dn_left, fp, 0); + (void) fprintf(fp, " : "); + dt_printd(dnp->dn_right, fp, 0); + (void) fprintf(fp, ")"); + break; + + case DT_NODE_DEXPR: + case DT_NODE_DFUNC: + (void) fprintf(fp, "%*s", depth * 8, ""); + dt_printd(dnp->dn_expr, fp, depth + 1); + (void) fprintf(fp, ";\n"); + break; + + case DT_NODE_PDESC: + (void) fprintf(fp, "%s:%s:%s:%s", + dnp->dn_desc->dtpd_provider, dnp->dn_desc->dtpd_mod, + dnp->dn_desc->dtpd_func, dnp->dn_desc->dtpd_name); + break; + + case DT_NODE_CLAUSE: + for (arg = dnp->dn_pdescs; arg != NULL; arg = arg->dn_list) { + dt_printd(arg, fp, 0); + if (arg->dn_list != NULL) + (void) fprintf(fp, ","); + (void) fprintf(fp, "\n"); + } + + if (dnp->dn_pred != NULL) { + (void) fprintf(fp, "/"); + dt_printd(dnp->dn_pred, fp, 0); + (void) fprintf(fp, "/\n"); + } + (void) fprintf(fp, "{\n"); + + for (arg = dnp->dn_acts; arg != NULL; arg = arg->dn_list) + dt_printd(arg, fp, depth + 1); + (void) fprintf(fp, "}\n"); + (void) fprintf(fp, "\n"); + break; + + case DT_NODE_IF: + (void) fprintf(fp, "%*sif (", depth * 8, ""); + dt_printd(dnp->dn_conditional, fp, 0); + (void) fprintf(fp, ") {\n"); + + for (arg = dnp->dn_body; arg != NULL; arg = arg->dn_list) + dt_printd(arg, fp, depth + 1); + if (dnp->dn_alternate_body == NULL) { + (void) fprintf(fp, "%*s}\n", depth * 8, ""); + } else { + (void) fprintf(fp, "%*s} else {\n", depth * 8, ""); + for (arg = dnp->dn_alternate_body; arg != NULL; + arg = arg->dn_list) + dt_printd(arg, fp, depth + 1); + (void) fprintf(fp, "%*s}\n", depth * 8, ""); + } + + break; + + default: + (void) fprintf(fp, "/* bad node %p, kind %d */\n", + (void *)dnp, dnp->dn_kind); + } +} + void dt_node_printr(dt_node_t *dnp, FILE *fp, int depth) { @@ -4666,7 +4845,7 @@ dt_node_printr(dt_node_t *dnp, FILE *fp, case DT_NODE_INT: (void) fprintf(fp, "INT 0x%llx (%s)\n", - (unsigned long long)dnp->dn_value, buf); + (u_longlong_t)dnp->dn_value, buf); break; case DT_NODE_STRING: @@ -4730,6 +4909,13 @@ dt_node_printr(dt_node_t *dnp, FILE *fp, (void) fprintf(fp, "OP2 %s (%s)\n", opstr(dnp->dn_op), buf); dt_node_printr(dnp->dn_left, fp, depth + 1); dt_node_printr(dnp->dn_right, fp, depth + 1); + if (dnp->dn_op == DT_TOK_LBRAC) { + dt_node_t *ln = dnp->dn_right; + while (ln->dn_list != NULL) { + dt_node_printr(ln->dn_list, fp, depth + 1); + ln = ln->dn_list; + } + } break; case DT_NODE_OP3: @@ -4791,6 +4977,7 @@ dt_node_printr(dt_node_t *dnp, FILE *fp, for (arg = dnp->dn_acts; arg != NULL; arg = arg->dn_list) dt_node_printr(arg, fp, depth + 1); + (void) fprintf(fp, "\n"); break; case DT_NODE_INLINE: @@ -4841,6 +5028,24 @@ dt_node_printr(dt_node_t *dnp, FILE *fp, dt_node_printr(arg, fp, depth + 1); break; + case DT_NODE_IF: + (void) fprintf(fp, "IF attr=%s CONDITION:\n", a); + + dt_node_printr(dnp->dn_conditional, fp, depth + 1); + + (void) fprintf(fp, "%*sIF BODY: \n", depth * 2, ""); + for (arg = dnp->dn_body; arg != NULL; arg = arg->dn_list) + dt_node_printr(arg, fp, depth + 1); + + if (dnp->dn_alternate_body != NULL) { + (void) fprintf(fp, "%*sIF ELSE: \n", depth * 2, ""); + for (arg = dnp->dn_alternate_body; arg != NULL; + arg = arg->dn_list) + dt_node_printr(arg, fp, depth + 1); + } + + break; + default: (void) fprintf(fp, "\n", (void *)dnp, dnp->dn_kind); @@ -4979,7 +5184,7 @@ yylabel(const char *label) yypcb->pcb_region = label; } -#if 0 +#ifndef __NetBSD__ int yywrap(void) { Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h =================================================================== RCS file: 
/home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h,v retrieving revision 1.3 diff -u -p -r1.3 dt_parser.h --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h 4 Feb 2016 16:48:34 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_parser.h 13 Apr 2017 01:56:30 -0000 @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright (c) 2013 Joyent, Inc. All rights reserved. */ @@ -105,6 +105,12 @@ typedef struct dt_node { struct dt_node *_probes; /* list of probe nodes */ int _redecl; /* provider redeclared */ } _provider; + + struct { + struct dt_node *_conditional; + struct dt_node *_body; + struct dt_node *_alternate_body; + } _conditional; } dn_u; struct dt_node *dn_list; /* parse tree list link */ @@ -140,6 +146,11 @@ typedef struct dt_node { #define dn_provred dn_u._provider._redecl /* DT_NODE_PROVIDER */ #define dn_probes dn_u._provider._probes /* DT_NODE_PROVIDER */ +/* DT_NODE_IF: */ +#define dn_conditional dn_u._conditional._conditional +#define dn_body dn_u._conditional._body +#define dn_alternate_body dn_u._conditional._alternate_body + #define DT_NODE_FREE 0 /* unused node (waiting to be freed) */ #define DT_NODE_INT 1 /* integer value */ #define DT_NODE_STRING 2 /* string value */ @@ -162,6 +173,7 @@ typedef struct dt_node { #define DT_NODE_PROBE 19 /* probe definition */ #define DT_NODE_PROVIDER 20 /* provider definition */ #define DT_NODE_PROG 21 /* program translation unit */ +#define DT_NODE_IF 22 /* if statement */ #define DT_NF_SIGNED 0x01 /* data is a signed quantity (else unsigned) */ #define DT_NF_COOKED 0x02 /* data is a known type (else still cooking) */ @@ -213,6 +225,7 @@ extern dt_node_t *dt_node_xlator(dt_decl extern dt_node_t *dt_node_probe(char *, int, dt_node_t *, dt_node_t *); extern dt_node_t *dt_node_provider(char *, dt_node_t *); extern dt_node_t *dt_node_program(dt_node_t *); +extern dt_node_t *dt_node_if(dt_node_t *, dt_node_t *, dt_node_t *); extern dt_node_t *dt_node_link(dt_node_t *, dt_node_t *); extern dt_node_t *dt_node_cook(dt_node_t *, uint_t); @@ -237,6 +250,7 @@ extern void dt_node_promote(dt_node_t *, extern void dt_node_diftype(dtrace_hdl_t *, const dt_node_t *, dtrace_diftype_t *); extern void dt_node_printr(dt_node_t *, FILE *, int); +extern void dt_printd(dt_node_t *, FILE *, int); extern const char *dt_node_name(const dt_node_t *, char *, size_t); extern int dt_node_root(dt_node_t *); Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c,v retrieving revision 1.7 diff -u -p -r1.7 dt_pid.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c 24 Sep 2015 14:25:29 -0000 1.7 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c 29 Apr 2017 03:26:56 -0000 @@ -572,12 +572,6 @@ dt_pid_usdt_mapping(void *data, const pr prsyminfo_t sip; dof_helper_t dh; GElf_Half e_type; -#if defined(__FreeBSD__) || defined(__NetBSD__) - dof_hdr_t hdr; - size_t sz; - uint64_t dofmax; - void *dof; -#endif const char *mname; const char *syms[] = { "___SUNW_dof", "__SUNW_dof" }; int i, fd = -1; @@ -607,61 +601,24 @@ dt_pid_usdt_mapping(void *data, const pr continue; } -#if defined(__FreeBSD__) || defined(__NetBSD__) + dh.dofhp_dof = sym.st_value; dh.dofhp_addr = (e_type == ET_EXEC) ? 
0 : pmp->pr_vaddr; - if (Pread(P, &hdr, sizeof (hdr), sym.st_value) != - sizeof (hdr)) { - dt_dprintf("read of DOF header failed\n"); - continue; - } - - sz = sizeof(dofmax); - if (sysctlbyname("kern.dtrace.dof_maxsize", &dofmax, &sz, - NULL, 0) != 0) { - dt_dprintf("failed to read dof_maxsize: %s\n", - strerror(errno)); - continue; - } - if (dofmax < hdr.dofh_loadsz) { - dt_dprintf("DOF load size exceeds maximum\n"); - continue; - } - - if ((dof = malloc(hdr.dofh_loadsz)) == NULL) - return (-1); - - if (Pread(P, dof, hdr.dofh_loadsz, sym.st_value) != - hdr.dofh_loadsz) { - free(dof); - dt_dprintf("read of DOF section failed\n"); - continue; - } - - dh.dofhp_dof = (uintptr_t)dof; - dh.dofhp_pid = proc_getpid(P); - dt_pid_objname(dh.dofhp_mod, sizeof (dh.dofhp_mod), sip.prs_lmid, mname); +#if defined(__FreeBSD__) || defined(__NetBSD__) + dh.dofhp_pid = proc_getpid(P); + if (fd == -1 && (fd = open("/dev/dtrace/helper", O_RDWR, 0)) < 0) { dt_dprintf("open of helper device failed: %s\n", strerror(errno)); - free(dof); return (-1); /* errno is set for us */ } if (ioctl(fd, DTRACEHIOC_ADDDOF, &dh, sizeof (dh)) < 0) dt_dprintf("DOF was rejected for %s\n", dh.dofhp_mod); - - free(dof); #else - dh.dofhp_dof = sym.st_value; - dh.dofhp_addr = (e_type == ET_EXEC) ? 0 : pmp->pr_vaddr; - - dt_pid_objname(dh.dofhp_mod, sizeof (dh.dofhp_mod), - sip.prs_lmid, mname); - if (fd == -1 && (fd = pr_open(P, "/dev/dtrace/helper", O_RDWR, 0)) < 0) { dt_dprintf("pr_open of helper device failed: %s\n", @@ -772,8 +729,13 @@ dt_pid_create_probes(dtrace_probedesc_t (void) snprintf(provname, sizeof (provname), "pid%d", (int)pid); if (gmatch(provname, pdp->dtpd_provider) != 0) { +#if defined(__FreeBSD__) || defined(__NetBSD__) + if ((P = dt_proc_grab(dtp, pid, 0, 1)) == NULL) +#else if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, - 0)) == NULL) { + 0)) == NULL) +#endif + { (void) dt_pid_error(dtp, pcb, NULL, NULL, D_PROC_GRAB, "failed to grab process %d", (int)pid); return (-1); @@ -974,7 +936,6 @@ dt_pid_get_types(dtrace_hdl_t *dtp, cons mptr = pdp->dtpd_mod; lmid = 0; } - __USE(lmid); if (Pxlookup_by_name(p, lmid, mptr, pdp->dtpd_func, &sym, &si) != 0) { Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_print.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c 24 Sep 2015 14:25:29 -0000 1.1 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_print.c 29 Apr 2017 03:27:15 -0000 @@ -190,7 +190,7 @@ print_bitfield(dt_printarg_t *pap, ulong value >>= shift; value &= mask; - (void) fprintf(fp, "%#llx", (unsigned long long)value); + (void) fprintf(fp, "%#llx", (u_longlong_t)value); } /* Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c,v retrieving revision 1.10 diff -u -p -r1.10 dt_printf.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c 1 Oct 2015 20:51:19 -0000 1.10 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c 29 Apr 2017 03:28:59 -0000 @@ -25,11 +25,7 @@ * Copyright (c) 2013 by Delphix. All rights reserved. */ -#ifdef illumos #include -#else -#define ABS(a) ((a) < 0 ? 
-(a) : (a)) -#endif #include #include #include @@ -301,7 +297,8 @@ pfprint_fp(dtrace_hdl_t *dtp, FILE *fp, const dt_pfargd_t *pfd, const void *addr, size_t size, uint64_t normal) { double n = (double)normal; -#if !defined(__arm__) && !defined(__powerpc__) && !defined(__mips__) +#if !defined(__arm__) && !defined(__powerpc__) && \ + !defined(__mips__) && !defined(__riscv__) long double ldn = (long double)normal; #endif @@ -312,7 +309,8 @@ pfprint_fp(dtrace_hdl_t *dtp, FILE *fp, case sizeof (double): return (dt_printf(dtp, fp, format, *((double *)addr) / n)); -#if !defined(__arm__) && !defined(__powerpc__) && !defined(__mips__) +#if !defined(__arm__) && !defined(__powerpc__) && \ + !defined(__mips__) && !defined(__riscv__) case sizeof (long double): return (dt_printf(dtp, fp, format, *((long double *)addr) / ldn)); @@ -524,6 +522,7 @@ pfprint_port(dtrace_hdl_t *dtp, FILE *fp #ifdef illumos if ((sv = getservbyport_r(port, NULL, &res, buf, sizeof (buf))) != NULL) + return (dt_printf(dtp, fp, format, sv->s_name)); #elif defined(__FreeBSD__) if (getservbyport_r(port, NULL, &res, buf, sizeof (buf), &sv) > 0) return (dt_printf(dtp, fp, format, sv->s_name)); Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c,v retrieving revision 1.8 diff -u -p -r1.8 dt_proc.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c 26 Sep 2015 00:33:34 -0000 1.8 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c 29 Apr 2017 03:29:14 -0000 @@ -98,10 +98,6 @@ #define IS_SYS_EXEC(w) (w == SYS_execve) #define IS_SYS_FORK(w) (w == SYS_vfork || w == SYS_forksys) -#if !defined(__DECONST) && defined(__UNCONST) -#define __DECONST(a, b) __UNCONST(b) -#endif - static dt_bkpt_t * dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data) { @@ -115,7 +111,11 @@ dt_proc_bpcreate(dt_proc_t *dpr, uintptr dbp->dbp_data = data; dbp->dbp_addr = addr; +#ifdef __NetBSD__ if (Psetbkpt(P, dbp->dbp_addr, &dbp->dbp_instr) == 0) +#else + if (Psetbkpt(P, dbp->dbp_addr, dbp->dbp_instr) == 0) +#endif dbp->dbp_active = B_TRUE; dt_list_append(&dpr->dpr_bps, dbp); @@ -186,7 +186,11 @@ dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_pr (int)dpr->dpr_pid, (ulong_t)dbp->dbp_addr, ++dbp->dbp_hits); dbp->dbp_func(dtp, dpr, dbp->dbp_data); +#ifdef __NetBSD__ (void) Pxecbkpt(dpr->dpr_proc, &dbp->dbp_instr); +#else + (void) Pxecbkpt(dpr->dpr_proc, dbp->dbp_instr); +#endif } static void Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h,v retrieving revision 1.4 diff -u -p -r1.4 dt_proc.h --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h 26 Sep 2015 00:33:34 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.h 13 Apr 2017 17:05:26 -0000 @@ -80,7 +80,11 @@ typedef struct dt_bkpt { dt_bkpt_f *dbp_func; /* callback function to execute */ void *dbp_data; /* callback function private data */ uintptr_t dbp_addr; /* virtual address of breakpoint */ +#ifdef __NetBSD__ proc_breakpoint_t dbp_instr; /* saved instruction from breakpoint */ +#else + ulong_t dbp_instr; /* saved instruction from breakpoint */ +#endif ulong_t dbp_hits; /* count of breakpoint hits for debug */ int dbp_active; /* flag indicating breakpoint is on */ } dt_bkpt_t; Index: 
src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c,v retrieving revision 1.3 diff -u -p -r1.3 dt_provider.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c 24 Sep 2015 14:25:29 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_provider.c 29 Apr 2017 16:01:03 -0000 @@ -545,9 +545,7 @@ dt_probe_define(dt_provider_t *pvp, dt_p for (pip = prp->pr_inst; pip != NULL; pip = pip->pi_next) { if (strcmp(pip->pi_fname, fname) == 0 && - ((rname == NULL && pip->pi_rname == NULL) || - (rname != NULL && pip->pi_rname != NULL && - strcmp(pip->pi_rname, rname) == 0))) + strcmp(pip->pi_rname, rname) == 0) break; } @@ -565,7 +563,7 @@ dt_probe_define(dt_provider_t *pvp, dt_p if ((pip->pi_fname = strdup(fname)) == NULL) goto nomem; - if (rname != NULL && (pip->pi_rname = strdup(rname)) == NULL) + if ((pip->pi_rname = strdup(rname)) == NULL) goto nomem; pip->pi_noffs = 0; @@ -605,7 +603,7 @@ dt_probe_define(dt_provider_t *pvp, dt_p dt_dprintf("defined probe %s %s:%s %s() +0x%x (%s)\n", isenabled ? "(is-enabled)" : "", pvp->pv_desc.dtvd_name, prp->pr_ident->di_name, fname, offset, - rname != NULL ? rname : fname); + rname); assert(*noffs < *maxoffs); (*offs)[(*noffs)++] = offset; Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c,v retrieving revision 1.2 diff -u -p -r1.2 dt_regset.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c 24 Sep 2015 14:25:29 -0000 1.2 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_regset.c 13 Apr 2017 17:08:43 -0000 @@ -27,6 +27,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2016 Pedro Giffuni. All rights reserved. */ #include @@ -47,15 +48,15 @@ dt_regset_create(ulong_t nregs) if (drp == NULL) return (NULL); - drp->dr_bitmap = malloc(sizeof (ulong_t) * n); - drp->dr_size = nregs; + drp->dr_bitmap = calloc(n, sizeof (ulong_t)); if (drp->dr_bitmap == NULL) { dt_regset_destroy(drp); return (NULL); } - bzero(drp->dr_bitmap, sizeof (ulong_t) * n); + drp->dr_size = nregs; + return (drp); } Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 dt_strtab.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c 20 Feb 2010 04:33:49 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_strtab.c 13 Apr 2017 17:09:37 -0000 @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Portions Copyright 2016 Pedro Giffuni. All rights reserved. 
+ */ + #pragma ident "%Z%%M% %I% %E% SMI" #include @@ -70,12 +74,11 @@ dt_strtab_create(size_t bufsz) return (NULL); bzero(sp, sizeof (dt_strtab_t)); - sp->str_hash = malloc(nbuckets * sizeof (dt_strhash_t *)); + sp->str_hash = calloc(nbuckets, sizeof (dt_strhash_t *)); if (sp->str_hash == NULL) goto err; - bzero(sp->str_hash, nbuckets * sizeof (dt_strhash_t *)); sp->str_hashsz = nbuckets; sp->str_bufs = NULL; sp->str_ptr = NULL; @@ -253,8 +256,10 @@ dt_strtab_insert(dt_strtab_t *sp, const * Now copy the string data into our buffer list, and then update * the global counts of strings and bytes. Return str's byte offset. */ - if (dt_strtab_copyin(sp, str, len + 1) == -1) + if (dt_strtab_copyin(sp, str, len + 1) == -1) { + free(hp); return (-1L); + } sp->str_nstrs++; sp->str_size += len + 1; Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c,v retrieving revision 1.12 diff -u -p -r1.12 dt_subr.c --- src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c 24 Sep 2015 14:25:29 -0000 1.12 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c 29 Apr 2017 03:39:45 -0000 @@ -582,6 +582,7 @@ int dt_printf(dtrace_hdl_t *dtp, FILE *fp, const char *format, ...) { va_list ap; + va_list ap2; int n; #ifndef illumos @@ -606,11 +607,13 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c len = dtp->dt_sprintf_buflen - len; assert(len >= 0); - if ((n = vsnprintf(buf, len, format, ap)) < 0) + va_copy(ap2, ap); + if ((n = vsnprintf(buf, len, format, ap2)) < 0) n = dt_set_errno(dtp, errno); + va_end(ap2); va_end(ap); - + return (n); } @@ -641,11 +644,14 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c dtp->dt_buffered_buf[0] = '\0'; } - if ((needed = vsnprintf(NULL, 0, format, ap)) < 0) { + va_copy(ap2, ap); + if ((needed = vsnprintf(NULL, 0, format, ap2)) < 0) { rval = dt_set_errno(dtp, errno); + va_end(ap2); va_end(ap); return (rval); } + va_end(ap2); if (needed == 0) { va_end(ap); @@ -671,12 +677,15 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c dtp->dt_buffered_size <<= 1; } + va_copy(ap2, ap); if (vsnprintf(&dtp->dt_buffered_buf[dtp->dt_buffered_offs], - avail, format, ap) < 0) { + avail, format, ap2) < 0) { rval = dt_set_errno(dtp, errno); + va_end(ap2); va_end(ap); return (rval); } + va_end(ap2); dtp->dt_buffered_offs += needed; assert(dtp->dt_buffered_buf[dtp->dt_buffered_offs] == '\0'); @@ -684,8 +693,10 @@ dt_printf(dtrace_hdl_t *dtp, FILE *fp, c return (0); } - n = vfprintf(fp, format, ap); + va_copy(ap2, ap); + n = vfprintf(fp, format, ap2); fflush(fp); + va_end(ap2); va_end(ap); if (n < 0) { @@ -920,7 +931,7 @@ dtrace_addr2str(dtrace_hdl_t *dtp, uint6 if (err == 0 && addr != sym.st_value) { (void) snprintf(s, n, "%s`%s+0x%llx", dts.dts_object, - dts.dts_name, (unsigned long long)addr - sym.st_value); + dts.dts_name, (u_longlong_t)addr - sym.st_value); } else if (err == 0) { (void) snprintf(s, n, "%s`%s", dts.dts_object, dts.dts_name); @@ -932,9 +943,9 @@ dtrace_addr2str(dtrace_hdl_t *dtp, uint6 */ if (dtrace_lookup_by_addr(dtp, addr, NULL, &dts) == 0) { (void) snprintf(s, n, "%s`0x%llx", dts.dts_object, - (unsigned long long)addr); + (u_longlong_t)addr); } else { - (void) snprintf(s, n, "0x%llx", (unsigned long long)addr); + (void) snprintf(s, n, "0x%llx", (u_longlong_t)addr); } } @@ -967,7 +978,7 @@ dtrace_uaddr2str(dtrace_hdl_t *dtp, pid_ if (addr > sym.st_value) { (void) snprintf(c, sizeof (c), "%s`%s+0x%llx", obj, - name, 
(unsigned long long)(addr - sym.st_value)); + name, (u_longlong_t)(addr - sym.st_value)); } else { (void) snprintf(c, sizeof (c), "%s`%s", obj, name); } Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c diff -N src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dt_sugar.c 10 Oct 2016 11:14:31 -0000 @@ -0,0 +1,516 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + */ + +/* + * Syntactic sugar features are implemented by transforming the D parse tree + * such that it only uses the subset of D that is supported by the rest of the + * compiler / the kernel. A clause containing these language features is + * referred to as a "super-clause", and its transformation typically entails + * creating several "sub-clauses" to implement it. For diagnosability, the + * sub-clauses will be printed if the "-xtree=8" flag is specified. + * + * Currently, the only syntactic sugar feature is "if/else" statements. Each + * basic block (e.g. the body of the "if" and "else" statements, and the + * statements before and after) is turned into its own sub-clause, with a + * predicate that causes it to be executed only if the code flows to this point. + * Nested if/else statements are supported. + * + * This infrastructure is designed to accommodate other syntactic sugar features + * in the future. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct dt_sugar_parse { + dtrace_hdl_t *dtsp_dtp; /* dtrace handle */ + dt_node_t *dtsp_pdescs; /* probe descriptions */ + int dtsp_num_conditions; /* number of condition variables */ + int dtsp_num_ifs; /* number of "if" statements */ + dt_node_t *dtsp_clause_list; /* list of clauses */ +} dt_sugar_parse_t; + +static void dt_sugar_visit_stmts(dt_sugar_parse_t *, dt_node_t *, int); + +/* + * Return a node for "self->%error". + * + * Note that the "%" is part of the variable name, and is included so that + * this variable name can not collide with any user-specified variable. + * + * This error variable is used to keep track of if there has been an error + * in any of the sub-clauses, and is used to prevent execution of subsequent + * sub-clauses following an error. + */ +static dt_node_t * +dt_sugar_new_error_var(void) +{ + return (dt_node_op2(DT_TOK_PTR, dt_node_ident(strdup("self")), + dt_node_ident(strdup("%error")))); +} + +/* + * Append this clause to the clause list. + */ +static void +dt_sugar_append_clause(dt_sugar_parse_t *dp, dt_node_t *clause) +{ + dp->dtsp_clause_list = dt_node_link(dp->dtsp_clause_list, clause); +} + +/* + * Prepend this clause to the clause list. 
+ */ +static void +dt_sugar_prepend_clause(dt_sugar_parse_t *dp, dt_node_t *clause) +{ + dp->dtsp_clause_list = dt_node_link(clause, dp->dtsp_clause_list); +} + +/* + * Return a node for "this->%condition_", or NULL if condid==0. + * + * Note that the "%" is part of the variable name, and is included so that + * this variable name can not collide with any user-specified variable. + */ +static dt_node_t * +dt_sugar_new_condition_var(int condid) +{ + char *str; + + if (condid == 0) + return (NULL); + assert(condid > 0); + + (void) asprintf(&str, "%%condition_%d", ABS(condid)); + return (dt_node_op2(DT_TOK_PTR, dt_node_ident(strdup("this")), + dt_node_ident(str))); +} + +/* + * Return new clause to evaluate predicate and set newcond. condid is + * the condition that we are already under, or 0 if none. + * The new clause will be of the form: + * + * dp_pdescs + * /!self->%error/ + * { + * this->%condition_ = + * (this->%condition_ && pred); + * } + * + * Note: if condid==0, we will instead do "... = (1 && pred)", to effectively + * convert the pred to a boolean. + * + * Note: Unless an error has been encountered, we always set the condition + * variable (either to 0 or 1). This lets us avoid resetting the condition + * variables back to 0 when the super-clause completes. + */ +static dt_node_t * +dt_sugar_new_condition_impl(dt_sugar_parse_t *dp, + dt_node_t *pred, int condid, int newcond) +{ + dt_node_t *value, *body, *newpred; + + /* predicate is !self->%error */ + newpred = dt_node_op1(DT_TOK_LNEG, dt_sugar_new_error_var()); + + if (condid == 0) { + /* + * value is (1 && pred) + * + * Note, D doesn't allow a probe-local "this" variable to + * be reused as a different type, even from a different probe. + * Therefore, value can't simply be , because then + * its type could be different when we reuse this condid + * in a different meta-clause. + */ + value = dt_node_op2(DT_TOK_LAND, dt_node_int(1), pred); + } else { + /* value is (this->%condition_ && pred) */ + value = dt_node_op2(DT_TOK_LAND, + dt_sugar_new_condition_var(condid), pred); + } + + /* body is "this->%condition_ = ;" */ + body = dt_node_statement(dt_node_op2(DT_TOK_ASGN, + dt_sugar_new_condition_var(newcond), value)); + + return (dt_node_clause(dp->dtsp_pdescs, newpred, body)); +} + +/* + * Generate a new clause to evaluate predicate and set a new condition variable, + * whose ID will be returned. The new clause will be appended to + * dp_first_new_clause. + */ +static int +dt_sugar_new_condition(dt_sugar_parse_t *dp, dt_node_t *pred, int condid) +{ + dp->dtsp_num_conditions++; + dt_sugar_append_clause(dp, dt_sugar_new_condition_impl(dp, + pred, condid, dp->dtsp_num_conditions)); + return (dp->dtsp_num_conditions); +} + +/* + * Visit the specified node and all of its descendants. Currently this is only + * used to count the number of "if" statements (dtsp_num_ifs). 
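+ *
+ * For example, a body such as
+ *
+ *	if (arg0 == 0) {
+ *		trace(0);
+ *	} else {
+ *		if (arg1 != 0)
+ *			trace(1);
+ *	}
+ *
+ * contains two "if" statements (the nested one included), so this walk
+ * would leave dtsp_num_ifs at 2.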
+ */ +static void +dt_sugar_visit_all(dt_sugar_parse_t *dp, dt_node_t *dnp) +{ + dt_node_t *arg; + + switch (dnp->dn_kind) { + case DT_NODE_FREE: + case DT_NODE_INT: + case DT_NODE_STRING: + case DT_NODE_SYM: + case DT_NODE_TYPE: + case DT_NODE_PROBE: + case DT_NODE_PDESC: + case DT_NODE_IDENT: + break; + + case DT_NODE_FUNC: + for (arg = dnp->dn_args; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + break; + + case DT_NODE_OP1: + dt_sugar_visit_all(dp, dnp->dn_child); + break; + + case DT_NODE_OP2: + dt_sugar_visit_all(dp, dnp->dn_left); + dt_sugar_visit_all(dp, dnp->dn_right); + if (dnp->dn_op == DT_TOK_LBRAC) { + dt_node_t *ln = dnp->dn_right; + while (ln->dn_list != NULL) { + dt_sugar_visit_all(dp, ln->dn_list); + ln = ln->dn_list; + } + } + break; + + case DT_NODE_OP3: + dt_sugar_visit_all(dp, dnp->dn_expr); + dt_sugar_visit_all(dp, dnp->dn_left); + dt_sugar_visit_all(dp, dnp->dn_right); + break; + + case DT_NODE_DEXPR: + case DT_NODE_DFUNC: + dt_sugar_visit_all(dp, dnp->dn_expr); + break; + + case DT_NODE_AGG: + for (arg = dnp->dn_aggtup; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + + if (dnp->dn_aggfun) + dt_sugar_visit_all(dp, dnp->dn_aggfun); + break; + + case DT_NODE_CLAUSE: + for (arg = dnp->dn_pdescs; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + + if (dnp->dn_pred != NULL) + dt_sugar_visit_all(dp, dnp->dn_pred); + + for (arg = dnp->dn_acts; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + break; + + case DT_NODE_INLINE: { + const dt_idnode_t *inp = dnp->dn_ident->di_iarg; + + dt_sugar_visit_all(dp, inp->din_root); + break; + } + case DT_NODE_MEMBER: + if (dnp->dn_membexpr) + dt_sugar_visit_all(dp, dnp->dn_membexpr); + break; + + case DT_NODE_XLATOR: + for (arg = dnp->dn_members; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + break; + + case DT_NODE_PROVIDER: + for (arg = dnp->dn_probes; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + break; + + case DT_NODE_PROG: + for (arg = dnp->dn_list; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + break; + + case DT_NODE_IF: + dp->dtsp_num_ifs++; + dt_sugar_visit_all(dp, dnp->dn_conditional); + + for (arg = dnp->dn_body; arg != NULL; arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + for (arg = dnp->dn_alternate_body; arg != NULL; + arg = arg->dn_list) + dt_sugar_visit_all(dp, arg); + + break; + + default: + (void) dnerror(dnp, D_UNKNOWN, "bad node %p, kind %d\n", + (void *)dnp, dnp->dn_kind); + } +} + +/* + * Return a new clause which resets the error variable to zero: + * + * dp_pdescs{ self->%error = 0; } + * + * This clause will be executed at the beginning of each meta-clause, to + * ensure the error variable is unset (in case the previous meta-clause + * failed). + */ +static dt_node_t * +dt_sugar_new_clearerror_clause(dt_sugar_parse_t *dp) +{ + dt_node_t *stmt = dt_node_statement(dt_node_op2(DT_TOK_ASGN, + dt_sugar_new_error_var(), dt_node_int(0))); + return (dt_node_clause(dp->dtsp_pdescs, NULL, stmt)); +} + +/* + * Evaluate the conditional, and recursively visit the body of the "if" + * statement (and the "else", if present). 
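+ *
+ * Illustrative sketch: for "if (PRED) { A } else { B }" evaluated under
+ * an enclosing condition C (or the constant 1 when there is none), this
+ * generates roughly
+ *
+ *	this->%condition_N = (this->%condition_C && PRED);
+ *	... A, predicated on !self->%error && this->%condition_N ...
+ *	this->%condition_M = (this->%condition_C && !this->%condition_N);
+ *	... B, predicated on !self->%error && this->%condition_M ...
+ *
+ * where N and M are freshly allocated condition ids.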
+ */ +static void +dt_sugar_do_if(dt_sugar_parse_t *dp, dt_node_t *if_stmt, int precondition) +{ + int newid; + + assert(if_stmt->dn_kind == DT_NODE_IF); + + /* condition */ + newid = dt_sugar_new_condition(dp, + if_stmt->dn_conditional, precondition); + + /* body of if */ + dt_sugar_visit_stmts(dp, if_stmt->dn_body, newid); + + /* + * Visit the body of the "else" statement, if present. Note that we + * generate a new condition which is the inverse of the previous + * condition. + */ + if (if_stmt->dn_alternate_body != NULL) { + dt_node_t *pred = + dt_node_op1(DT_TOK_LNEG, dt_sugar_new_condition_var(newid)); + dt_sugar_visit_stmts(dp, if_stmt->dn_alternate_body, + dt_sugar_new_condition(dp, pred, precondition)); + } +} + +/* + * Generate a new clause to evaluate the statements based on the condition. + * The new clause will be appended to dp_first_new_clause. + * + * dp_pdescs + * /!self->%error && this->%condition_/ + * { + * stmts + * } + */ +static void +dt_sugar_new_basic_block(dt_sugar_parse_t *dp, int condid, dt_node_t *stmts) +{ + dt_node_t *pred = NULL; + + if (condid == 0) { + /* + * Don't bother with !error on the first clause, because if + * there is only one clause, we don't add the prelude to + * zero out %error. + */ + if (dp->dtsp_num_conditions != 0) { + pred = dt_node_op1(DT_TOK_LNEG, + dt_sugar_new_error_var()); + } + } else { + pred = dt_node_op2(DT_TOK_LAND, + dt_node_op1(DT_TOK_LNEG, dt_sugar_new_error_var()), + dt_sugar_new_condition_var(condid)); + } + dt_sugar_append_clause(dp, + dt_node_clause(dp->dtsp_pdescs, pred, stmts)); +} + +/* + * Visit all the statements in this list, and break them into basic blocks, + * generating new clauses for "if" and "else" statements. + */ +static void +dt_sugar_visit_stmts(dt_sugar_parse_t *dp, dt_node_t *stmts, int precondition) +{ + dt_node_t *stmt; + dt_node_t *prev_stmt = NULL; + dt_node_t *next_stmt; + dt_node_t *first_stmt_in_basic_block = NULL; + + for (stmt = stmts; stmt != NULL; stmt = next_stmt) { + next_stmt = stmt->dn_list; + + if (stmt->dn_kind != DT_NODE_IF) { + if (first_stmt_in_basic_block == NULL) + first_stmt_in_basic_block = stmt; + prev_stmt = stmt; + continue; + } + + /* + * Remove this and following statements from the previous + * clause. + */ + if (prev_stmt != NULL) + prev_stmt->dn_list = NULL; + + /* + * Generate clause for statements preceding the "if" + */ + if (first_stmt_in_basic_block != NULL) { + dt_sugar_new_basic_block(dp, precondition, + first_stmt_in_basic_block); + } + + dt_sugar_do_if(dp, stmt, precondition); + + first_stmt_in_basic_block = NULL; + + prev_stmt = stmt; + } + + /* generate clause for statements after last "if". */ + if (first_stmt_in_basic_block != NULL) { + dt_sugar_new_basic_block(dp, precondition, + first_stmt_in_basic_block); + } +} + +/* + * Generate a new clause which will set the error variable when an error occurs. + * Only one of these clauses is created per program (e.g. script file). + * The clause is: + * + * dtrace:::ERROR{ self->%error = 1; } + */ +static dt_node_t * +dt_sugar_makeerrorclause(void) +{ + dt_node_t *acts, *pdesc; + + pdesc = dt_node_pdesc_by_name(strdup("dtrace:::ERROR")); + + acts = dt_node_statement(dt_node_op2(DT_TOK_ASGN, + dt_sugar_new_error_var(), dt_node_int(1))); + + return (dt_node_clause(pdesc, NULL, acts)); +} + +/* + * Transform the super-clause into straight-D, returning the new list of + * sub-clauses. 
+ */ +dt_node_t * +dt_compile_sugar(dtrace_hdl_t *dtp, dt_node_t *clause) +{ + dt_sugar_parse_t dp = { 0 }; + int condid = 0; + + dp.dtsp_dtp = dtp; + dp.dtsp_pdescs = clause->dn_pdescs; + + /* make dt_node_int() generate an "int"-typed integer */ + yyintdecimal = B_TRUE; + yyintsuffix[0] = '\0'; + yyintprefix = 0; + + dt_sugar_visit_all(&dp, clause); + + if (dp.dtsp_num_ifs == 0 && dp.dtsp_num_conditions == 0) { + /* + * There is nothing that modifies the number of clauses. Use + * the existing clause as-is, with its predicate intact. This + * ensures that in the absence of D sugar, the body of the + * clause can create a variable that is referenced in the + * predicate. + */ + dt_sugar_append_clause(&dp, dt_node_clause(clause->dn_pdescs, + clause->dn_pred, clause->dn_acts)); + } else { + if (clause->dn_pred != NULL) { + condid = dt_sugar_new_condition(&dp, + clause->dn_pred, condid); + } + + if (clause->dn_acts == NULL) { + /* + * dt_sugar_visit_stmts() does not emit a clause with + * an empty body (e.g. if there's an empty "if" body), + * but we need the empty body here so that we + * continue to get the default tracing action. + */ + dt_sugar_new_basic_block(&dp, condid, NULL); + } else { + dt_sugar_visit_stmts(&dp, clause->dn_acts, condid); + } + } + + if (dp.dtsp_num_conditions != 0) { + dt_sugar_prepend_clause(&dp, + dt_sugar_new_clearerror_clause(&dp)); + } + + if (dp.dtsp_clause_list != NULL && + dp.dtsp_clause_list->dn_list != NULL && !dtp->dt_has_sugar) { + dtp->dt_has_sugar = B_TRUE; + dt_sugar_prepend_clause(&dp, dt_sugar_makeerrorclause()); + } + return (dp.dtsp_clause_list); +} Index: src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h,v retrieving revision 1.5 diff -u -p -r1.5 dtrace.h --- src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h 24 Sep 2015 14:25:29 -0000 1.5 +++ src/external/cddl/osnet/dist/lib/libdtrace/common/dtrace.h 13 Apr 2017 17:14:37 -0000 @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
*/ @@ -59,6 +59,7 @@ extern "C" { #define DTRACE_VERSION 3 /* library ABI interface version */ struct ps_prochandle; +struct dt_node; typedef struct dtrace_hdl dtrace_hdl_t; typedef struct dtrace_prog dtrace_prog_t; typedef struct dtrace_vector dtrace_vector_t; @@ -115,7 +116,7 @@ typedef struct dtrace_proginfo { #define DTRACE_C_CPP 0x0010 /* Preprocess input file with cpp(1) utility */ #define DTRACE_C_KNODEF 0x0020 /* Permit unresolved kernel symbols in DIFO */ #define DTRACE_C_UNODEF 0x0040 /* Permit unresolved user symbols in DIFO */ -#define DTRACE_C_PSPEC 0x0080 /* Intepret ambiguous specifiers as probes */ +#define DTRACE_C_PSPEC 0x0080 /* Interpret ambiguous specifiers as probes */ #define DTRACE_C_ETAGS 0x0100 /* Prefix error messages with error tags */ #define DTRACE_C_ARGREF 0x0200 /* Do not require all macro args to be used */ #define DTRACE_C_DEFARG 0x0800 /* Use 0/"" as value for unspecified args */ @@ -523,6 +524,10 @@ extern int dtrace_type_strcompile(dtrace extern int dtrace_type_fcompile(dtrace_hdl_t *, FILE *, dtrace_typeinfo_t *); +extern struct dt_node *dt_compile_sugar(dtrace_hdl_t *, + struct dt_node *); + + /* * DTrace Probe Interface * Index: src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_isadep.c --- src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c 24 Sep 2015 14:26:09 -0000 1.1 +++ src/external/cddl/osnet/dist/lib/libdtrace/i386/dt_isadep.c 7 May 2017 03:58:22 -0000 @@ -40,9 +40,8 @@ #include -#ifndef illumos -#define PR_MODEL_ILP32 1 -#define PR_MODEL_LP64 2 +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include #include #endif @@ -94,11 +93,7 @@ dt_pid_has_jump_table(struct ps_prochand char dmodel = Pstatus(P)->pr_dmodel; #else pid_t pid = proc_getpid(P); -#if __i386__ - char dmodel = PR_MODEL_ILP32; -#elif __amd64__ - char dmodel = PR_MODEL_LP64; -#endif + char dmodel = proc_getmodel(P); #endif /* @@ -150,11 +145,7 @@ dt_pid_create_return_probe(struct ps_pro char dmodel = Pstatus(P)->pr_dmodel; #else pid_t pid = proc_getpid(P); -#if __i386__ - char dmodel = PR_MODEL_ILP32; -#elif __amd64__ - char dmodel = PR_MODEL_LP64; -#endif + char dmodel = proc_getmodel(P); #endif /* @@ -311,11 +302,7 @@ dt_pid_create_offset_probe(struct ps_pro char dmodel = Pstatus(P)->pr_dmodel; #else pid_t pid = proc_getpid(P); -#if __i386__ - char dmodel = PR_MODEL_ILP32; -#elif __amd64__ - char dmodel = PR_MODEL_LP64; -#endif + char dmodel = proc_getmodel(P); #endif if ((text = malloc(symp->st_size)) == NULL) { @@ -394,11 +381,7 @@ dt_pid_create_glob_offset_probes(struct char dmodel = Pstatus(P)->pr_dmodel; #else pid_t pid = proc_getpid(P); -#if __i386__ - char dmodel = PR_MODEL_ILP32; -#elif __amd64__ - char dmodel = PR_MODEL_LP64; -#endif + char dmodel = proc_getmodel(P); #endif ftp->ftps_type = DTFTP_OFFSETS; @@ -522,13 +505,8 @@ dt_instr_size(uchar_t *instr, dtrace_hdl cpu_mode = (dmodel == PR_MODEL_ILP32) ? SIZE32 : SIZE64; -#ifdef notyet - // XXX: does not compile! 
if (dtrace_disx86(&x86dis, cpu_mode) != 0) return (-1); -#else - __USE(cpu_mode); -#endif /* * If the instruction was a single-byte breakpoint, there may be Index: src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c,v retrieving revision 1.4 diff -u -p -r1.4 gmatch.c --- src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c 7 Feb 2015 20:30:03 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libgen/common/gmatch.c 13 Apr 2017 17:22:08 -0000 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,34 +19,34 @@ * CDDL HEADER END */ -/* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ - /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.1.5.2 */ +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#pragma ident "%Z%%M% %I% %E% SMI" /*LINTLIBRARY*/ -#if defined(sun) +#ifdef illumos #pragma weak gmatch = _gmatch #endif -#if defined(sun) +#ifdef illumos #include "gen_synonyms.h" #endif #include #include #include #include -#include -#if defined(sun) +#ifdef illumos #include #include "_range.h" #else +#include /* DOODAD */ static int multibyte = 0; #define WCHAR_CSMASK 0x30000000 #define valid_range(c1, c2) \ @@ -61,7 +60,7 @@ c = cl; \ if (n <= 0) \ return (0); \ - p += n; + p += n int gmatch(const char *, const char *); int @@ -103,13 +102,13 @@ gmatch(const char *s, const char *p) notflag = 1; p++; } - Popwchar(p, c) + Popwchar(p, c); do { if (c == '-' && lc && *p != ']') { - Popwchar(p, c) + Popwchar(p, c); if (c == '\\') { - Popwchar(p, c) + Popwchar(p, c); } if (notflag) { if (!multibyte || @@ -128,7 +127,7 @@ gmatch(const char *s, const char *p) } } else if (c == '\\') { /* skip to quoted character */ - Popwchar(p, c) + Popwchar(p, c); } lc = c; if (notflag) { @@ -142,14 +141,14 @@ gmatch(const char *s, const char *p) if (scc == lc) ok++; } - Popwchar(p, c) + Popwchar(p, c); } while (c != ']'); return (ok ? gmatch(s, p) : 0); } case '\\': /* skip to quoted character and see if it matches */ - Popwchar(p, c) + Popwchar(p, c); default: if (c != scc) Index: src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c,v retrieving revision 1.3 diff -u -p -r1.3 libnvpair.c --- src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c 19 Oct 2013 23:07:39 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c 20 Apr 2017 17:38:21 -0000 @@ -19,15 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ +#include +#include #include -#include +#include #include -#include -#include +#include +#include #include "libnvpair.h" /* @@ -38,21 +40,531 @@ * between kernel and userland, and possibly saving onto disk files. */ +/* + * Print control structure. + */ + +#define DEFINEOP(opname, vtype) \ + struct { \ + int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ + const char *, vtype); \ + void *arg; \ + } opname + +#define DEFINEARROP(opname, vtype) \ + struct { \ + int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ + const char *, vtype, uint_t); \ + void *arg; \ + } opname + +struct nvlist_printops { + DEFINEOP(print_boolean, int); + DEFINEOP(print_boolean_value, boolean_t); + DEFINEOP(print_byte, uchar_t); + DEFINEOP(print_int8, int8_t); + DEFINEOP(print_uint8, uint8_t); + DEFINEOP(print_int16, int16_t); + DEFINEOP(print_uint16, uint16_t); + DEFINEOP(print_int32, int32_t); + DEFINEOP(print_uint32, uint32_t); + DEFINEOP(print_int64, int64_t); + DEFINEOP(print_uint64, uint64_t); + DEFINEOP(print_double, double); + DEFINEOP(print_string, char *); + DEFINEOP(print_hrtime, hrtime_t); + DEFINEOP(print_nvlist, nvlist_t *); + DEFINEARROP(print_boolean_array, boolean_t *); + DEFINEARROP(print_byte_array, uchar_t *); + DEFINEARROP(print_int8_array, int8_t *); + DEFINEARROP(print_uint8_array, uint8_t *); + DEFINEARROP(print_int16_array, int16_t *); + DEFINEARROP(print_uint16_array, uint16_t *); + DEFINEARROP(print_int32_array, int32_t *); + DEFINEARROP(print_uint32_array, uint32_t *); + DEFINEARROP(print_int64_array, int64_t *); + DEFINEARROP(print_uint64_array, uint64_t *); + DEFINEARROP(print_string_array, char **); + DEFINEARROP(print_nvlist_array, nvlist_t **); +}; + +struct nvlist_prtctl { + FILE *nvprt_fp; /* output destination */ + enum nvlist_indent_mode nvprt_indent_mode; /* see above */ + int nvprt_indent; /* absolute indent, or tab depth */ + int nvprt_indentinc; /* indent or tab increment */ + const char *nvprt_nmfmt; /* member name format, max one %s */ + const char *nvprt_eomfmt; /* after member format, e.g. "\n" */ + const char *nvprt_btwnarrfmt; /* between array members */ + int nvprt_btwnarrfmt_nl; /* nvprt_eoamfmt includes newline? 
*/ + struct nvlist_printops *nvprt_dfltops; + struct nvlist_printops *nvprt_custops; +}; + +#define DFLTPRTOP(pctl, type) \ + ((pctl)->nvprt_dfltops->print_##type.op) + +#define DFLTPRTOPARG(pctl, type) \ + ((pctl)->nvprt_dfltops->print_##type.arg) + +#define CUSTPRTOP(pctl, type) \ + ((pctl)->nvprt_custops->print_##type.op) + +#define CUSTPRTOPARG(pctl, type) \ + ((pctl)->nvprt_custops->print_##type.arg) + +#define RENDER(pctl, type, nvl, name, val) \ + { \ + int done = 0; \ + if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ + done = CUSTPRTOP(pctl, type)(pctl, \ + CUSTPRTOPARG(pctl, type), nvl, name, val); \ + } \ + if (!done) { \ + (void) DFLTPRTOP(pctl, type)(pctl, \ + DFLTPRTOPARG(pctl, type), nvl, name, val); \ + } \ + (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \ + } + +#define ARENDER(pctl, type, nvl, name, arrp, count) \ + { \ + int done = 0; \ + if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ + done = CUSTPRTOP(pctl, type)(pctl, \ + CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \ + } \ + if (!done) { \ + (void) DFLTPRTOP(pctl, type)(pctl, \ + DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \ + } \ + (void) fprintf(pctl->nvprt_fp, pctl->nvprt_eomfmt); \ + } + +static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t); + +/* + * ====================================================================== + * | | + * | Indentation | + * | | + * ====================================================================== + */ + static void -indent(FILE *fp, int depth) +indent(nvlist_prtctl_t pctl, int onemore) { - while (depth-- > 0) - (void) fprintf(fp, "\t"); + int depth; + + switch (pctl->nvprt_indent_mode) { + case NVLIST_INDENT_ABS: + (void) fprintf(pctl->nvprt_fp, "%*s", + pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, ""); + break; + + case NVLIST_INDENT_TABBED: + depth = pctl->nvprt_indent + onemore; + while (depth-- > 0) + (void) fprintf(pctl->nvprt_fp, "\t"); + } } /* - * nvlist_print - Prints elements in an event buffer + * ====================================================================== + * | | + * | Default nvlist member rendering functions. | + * | | + * ====================================================================== + */ + +/* + * Generate functions to print single-valued nvlist members. 
+ * + * type_and_variant - suffix to form function name + * vtype - C type for the member value + * ptype - C type to cast value to for printing + * vfmt - format string for pair value, e.g "%d" or "0x%llx" + */ + +#define NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \ +static int \ +nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ + nvlist_t *nvl, const char *name, vtype value) \ +{ \ + FILE *fp = pctl->nvprt_fp; \ + NOTE(ARGUNUSED(private)) \ + NOTE(ARGUNUSED(nvl)) \ + indent(pctl, 1); \ + (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ + (void) fprintf(fp, vfmt, (ptype)value); \ + return (1); \ +} + +NVLIST_PRTFUNC(boolean, int, int, "%d") +NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d") +NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x") +NVLIST_PRTFUNC(int8, int8_t, int, "%d") +NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x") +NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d") +NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x") +NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d") +NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x") +NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld") +NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx") +NVLIST_PRTFUNC(double, double, double, "0x%f") +NVLIST_PRTFUNC(string, char *, char *, "%s") +NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx") + +/* + * Generate functions to print array-valued nvlist members. + */ + +#define NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \ +static int \ +nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ + nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \ +{ \ + FILE *fp = pctl->nvprt_fp; \ + uint_t i; \ + NOTE(ARGUNUSED(private)) \ + NOTE(ARGUNUSED(nvl)) \ + for (i = 0; i < count; i++) { \ + if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \ + indent(pctl, 1); \ + (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ + if (pctl->nvprt_btwnarrfmt_nl) \ + (void) fprintf(fp, "[%d]: ", i); \ + } \ + if (i != 0) \ + (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \ + (void) fprintf(fp, vfmt, (ptype)valuep[i]); \ + } \ + return (1); \ +} + +NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d") +NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x") +NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d") +NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x") +NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d") +NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x") +NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d") +NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x") +NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld") +NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx") +NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s") + +/*ARGSUSED*/ +static int +nvprint_nvlist(nvlist_prtctl_t pctl, void *private, + nvlist_t *nvl, const char *name, nvlist_t *value) +{ + FILE *fp = pctl->nvprt_fp; + + indent(pctl, 1); + (void) fprintf(fp, "%s = (embedded nvlist)\n", name); + + pctl->nvprt_indent += pctl->nvprt_indentinc; + nvlist_print_with_indent(value, pctl); + pctl->nvprt_indent -= pctl->nvprt_indentinc; + + indent(pctl, 1); + (void) fprintf(fp, "(end %s)\n", name); + + return (1); +} + +/*ARGSUSED*/ +static int +nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private, + nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count) +{ + FILE *fp = pctl->nvprt_fp; + uint_t i; + + indent(pctl, 1); + (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name); + + for (i = 0; i < count; i++) { + indent(pctl, 1); + (void) 
fprintf(fp, "(start %s[%d])\n", name, i); + + pctl->nvprt_indent += pctl->nvprt_indentinc; + nvlist_print_with_indent(valuep[i], pctl); + pctl->nvprt_indent -= pctl->nvprt_indentinc; + + indent(pctl, 1); + (void) fprintf(fp, "(end %s[%d])\n", name, i); + } + + return (1); +} + +/* + * ====================================================================== + * | | + * | Interfaces that allow control over formatting. | + * | | + * ====================================================================== + */ + +void +nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp) +{ + pctl->nvprt_fp = fp; +} + +FILE * +nvlist_prtctl_getdest(nvlist_prtctl_t pctl) +{ + return (pctl->nvprt_fp); +} + + +void +nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode, + int start, int inc) +{ + if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED) + mode = NVLIST_INDENT_TABBED; + + if (start < 0) + start = 0; + + if (inc < 0) + inc = 1; + + pctl->nvprt_indent_mode = mode; + pctl->nvprt_indent = start; + pctl->nvprt_indentinc = inc; +} + +void +nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore) +{ + indent(pctl, onemore); +} + + +void +nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, + const char *fmt) +{ + switch (which) { + case NVLIST_FMT_MEMBER_NAME: + if (fmt == NULL) + fmt = "%s = "; + pctl->nvprt_nmfmt = fmt; + break; + + case NVLIST_FMT_MEMBER_POSTAMBLE: + if (fmt == NULL) + fmt = "\n"; + pctl->nvprt_eomfmt = fmt; + break; + + case NVLIST_FMT_BTWN_ARRAY: + if (fmt == NULL) { + pctl->nvprt_btwnarrfmt = " "; + pctl->nvprt_btwnarrfmt_nl = 0; + } else { + pctl->nvprt_btwnarrfmt = fmt; + pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL); + } + break; + + default: + break; + } +} + + +void +nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...) 
+{ + FILE *fp = pctl->nvprt_fp; + va_list ap; + char *name; + + va_start(ap, which); + + switch (which) { + case NVLIST_FMT_MEMBER_NAME: + name = va_arg(ap, char *); + (void) fprintf(fp, pctl->nvprt_nmfmt, name); + break; + + case NVLIST_FMT_MEMBER_POSTAMBLE: + (void) fprintf(fp, pctl->nvprt_eomfmt); + break; + + case NVLIST_FMT_BTWN_ARRAY: + (void) fprintf(fp, pctl->nvprt_btwnarrfmt); \ + break; + + default: + break; + } + + va_end(ap); +} + +/* + * ====================================================================== + * | | + * | Interfaces to allow appointment of replacement rendering functions.| + * | | + * ====================================================================== + */ + +#define NVLIST_PRINTCTL_REPLACE(type, vtype) \ +void \ +nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ + int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \ + void *private) \ +{ \ + CUSTPRTOP(pctl, type) = func; \ + CUSTPRTOPARG(pctl, type) = private; \ +} + +NVLIST_PRINTCTL_REPLACE(boolean, int) +NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t) +NVLIST_PRINTCTL_REPLACE(byte, uchar_t) +NVLIST_PRINTCTL_REPLACE(int8, int8_t) +NVLIST_PRINTCTL_REPLACE(uint8, uint8_t) +NVLIST_PRINTCTL_REPLACE(int16, int16_t) +NVLIST_PRINTCTL_REPLACE(uint16, uint16_t) +NVLIST_PRINTCTL_REPLACE(int32, int32_t) +NVLIST_PRINTCTL_REPLACE(uint32, uint32_t) +NVLIST_PRINTCTL_REPLACE(int64, int64_t) +NVLIST_PRINTCTL_REPLACE(uint64, uint64_t) +NVLIST_PRINTCTL_REPLACE(double, double) +NVLIST_PRINTCTL_REPLACE(string, char *) +NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t) +NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *) + +#define NVLIST_PRINTCTL_AREPLACE(type, vtype) \ +void \ +nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ + int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \ + uint_t), void *private) \ +{ \ + CUSTPRTOP(pctl, type) = func; \ + CUSTPRTOPARG(pctl, type) = private; \ +} + +NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *) +NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *) +NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *) +NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *) +NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *) +NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *) +NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *) +NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *) +NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *) +NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *) +NVLIST_PRINTCTL_AREPLACE(string_array, char **) +NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **) + +/* + * ====================================================================== + * | | + * | Interfaces to manage nvlist_prtctl_t cookies. 
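The NVLIST_PRINTCTL_REPLACE/AREPLACE generators above produce the nvlist_prtctlop_*() appointment functions. A minimal sketch of appointing a replacement renderer for DATA_TYPE_STRING members follows; the callback name and the quoting behaviour are illustrative only, and the helper calls (nvlist_prtctl_getdest, _doindent, _dofmt) are the ones defined in the hunks above. Returning 1 tells the printing code the member has been rendered; returning 0 falls back to the default nvprint_string().

#include <stdio.h>
#include <libnvpair.h>

static int
render_string_quoted(nvlist_prtctl_t pctl, void *private, nvlist_t *nvl,
    const char *name, char *value)
{
	FILE *fp = nvlist_prtctl_getdest(pctl);

	/* private and nvl are unused in this sketch */
	nvlist_prtctl_doindent(pctl, 1);
	nvlist_prtctl_dofmt(pctl, NVLIST_FMT_MEMBER_NAME, name);
	(void) fprintf(fp, "\"%s\"", value);
	return (1);
}

/* ... after obtaining pctl from nvlist_prtctl_alloc(): */
/* nvlist_prtctlop_string(pctl, render_string_quoted, NULL); */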
| + * | | + * ====================================================================== */ -static + + +static const struct nvlist_printops defprtops = { + { nvprint_boolean, NULL }, + { nvprint_boolean_value, NULL }, + { nvprint_byte, NULL }, + { nvprint_int8, NULL }, + { nvprint_uint8, NULL }, + { nvprint_int16, NULL }, + { nvprint_uint16, NULL }, + { nvprint_int32, NULL }, + { nvprint_uint32, NULL }, + { nvprint_int64, NULL }, + { nvprint_uint64, NULL }, + { nvprint_double, NULL }, + { nvprint_string, NULL }, + { nvprint_hrtime, NULL }, + { nvprint_nvlist, NULL }, + { nvaprint_boolean_array, NULL }, + { nvaprint_byte_array, NULL }, + { nvaprint_int8_array, NULL }, + { nvaprint_uint8_array, NULL }, + { nvaprint_int16_array, NULL }, + { nvaprint_uint16_array, NULL }, + { nvaprint_int32_array, NULL }, + { nvaprint_uint32_array, NULL }, + { nvaprint_int64_array, NULL }, + { nvaprint_uint64_array, NULL }, + { nvaprint_string_array, NULL }, + { nvaprint_nvlist_array, NULL }, +}; + +static void +prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl, + struct nvlist_printops *ops) +{ + pctl->nvprt_fp = fp; + pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED; + pctl->nvprt_indent = 0; + pctl->nvprt_indentinc = 1; + pctl->nvprt_nmfmt = "%s = "; + pctl->nvprt_eomfmt = "\n"; + pctl->nvprt_btwnarrfmt = " "; + pctl->nvprt_btwnarrfmt_nl = 0; + + pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops; + pctl->nvprt_custops = ops; +} + +nvlist_prtctl_t +nvlist_prtctl_alloc(void) +{ + struct nvlist_prtctl *pctl; + struct nvlist_printops *ops; + + if ((pctl = malloc(sizeof (*pctl))) == NULL) + return (NULL); + + if ((ops = calloc(1, sizeof (*ops))) == NULL) { + free(pctl); + return (NULL); + } + + prtctl_defaults(stdout, pctl, ops); + + return (pctl); +} + void -nvlist_print_with_indent(FILE *fp, nvlist_t *nvl, int depth) +nvlist_prtctl_free(nvlist_prtctl_t pctl) { - int i; + if (pctl != NULL) { + free(pctl->nvprt_custops); + free(pctl); + } +} + +/* + * ====================================================================== + * | | + * | Top-level print request interfaces. 
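nvlist_prtctl_alloc(), prtctl_defaults() and nvlist_prtctl_free() above manage the opaque print-control cookie; together with the formatting setters from the earlier hunks and nvlist_prt() (added further down in this patch) they give the usual call sequence. A minimal usage sketch, assuming an already populated nvlist_t *nvl:

#include <stdio.h>
#include <libnvpair.h>

void
print_one_element_per_line(nvlist_t *nvl)
{
	nvlist_prtctl_t pctl;

	if ((pctl = nvlist_prtctl_alloc()) == NULL)
		return;		/* out of memory */

	nvlist_prtctl_setdest(pctl, stderr);
	nvlist_prtctl_setindent(pctl, NVLIST_INDENT_ABS, 0, 2);
	/*
	 * A separator containing "\n" sets nvprt_btwnarrfmt_nl, so the
	 * array renderers above print each element on its own
	 * "name[i]: " line.
	 */
	nvlist_prtctl_setfmt(pctl, NVLIST_FMT_BTWN_ARRAY, "\n");

	nvlist_prt(nvl, pctl);
	nvlist_prtctl_free(pctl);
}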
| + * | | + * ====================================================================== + */ + +/* + * nvlist_print - Prints elements in an event buffer + */ +static void +nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl) +{ + FILE *fp = pctl->nvprt_fp; char *name; uint_t nelem; nvpair_t *nvp; @@ -60,7 +572,7 @@ nvlist_print_with_indent(FILE *fp, nvlis if (nvl == NULL) return; - indent(fp, depth); + indent(pctl, 0); (void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl)); nvp = nvlist_next_nvpair(nvl, NULL); @@ -68,198 +580,174 @@ nvlist_print_with_indent(FILE *fp, nvlis while (nvp) { data_type_t type = nvpair_type(nvp); - indent(fp, depth); name = nvpair_name(nvp); - (void) fprintf(fp, "\t%s =", name); nelem = 0; + switch (type) { case DATA_TYPE_BOOLEAN: { - (void) fprintf(fp, " 1"); + RENDER(pctl, boolean, nvl, name, 1); break; } case DATA_TYPE_BOOLEAN_VALUE: { boolean_t val; (void) nvpair_value_boolean_value(nvp, &val); - (void) fprintf(fp, " %d", val); + RENDER(pctl, boolean_value, nvl, name, val); break; } case DATA_TYPE_BYTE: { uchar_t val; (void) nvpair_value_byte(nvp, &val); - (void) fprintf(fp, " 0x%2.2x", val); + RENDER(pctl, byte, nvl, name, val); break; } case DATA_TYPE_INT8: { int8_t val; (void) nvpair_value_int8(nvp, &val); - (void) fprintf(fp, " %d", val); + RENDER(pctl, int8, nvl, name, val); break; } case DATA_TYPE_UINT8: { uint8_t val; (void) nvpair_value_uint8(nvp, &val); - (void) fprintf(fp, " 0x%x", val); + RENDER(pctl, uint8, nvl, name, val); break; } case DATA_TYPE_INT16: { int16_t val; (void) nvpair_value_int16(nvp, &val); - (void) fprintf(fp, " %d", val); + RENDER(pctl, int16, nvl, name, val); break; } case DATA_TYPE_UINT16: { uint16_t val; (void) nvpair_value_uint16(nvp, &val); - (void) fprintf(fp, " 0x%x", val); + RENDER(pctl, uint16, nvl, name, val); break; } case DATA_TYPE_INT32: { int32_t val; (void) nvpair_value_int32(nvp, &val); - (void) fprintf(fp, " %d", val); + RENDER(pctl, int32, nvl, name, val); break; } case DATA_TYPE_UINT32: { uint32_t val; (void) nvpair_value_uint32(nvp, &val); - (void) fprintf(fp, " 0x%x", val); + RENDER(pctl, uint32, nvl, name, val); break; } case DATA_TYPE_INT64: { int64_t val; (void) nvpair_value_int64(nvp, &val); - (void) fprintf(fp, " %" PRId64 , val); + RENDER(pctl, int64, nvl, name, val); break; } case DATA_TYPE_UINT64: { uint64_t val; (void) nvpair_value_uint64(nvp, &val); - (void) fprintf(fp, " 0x%" PRIx64, val); + RENDER(pctl, uint64, nvl, name, val); break; } case DATA_TYPE_DOUBLE: { double val; (void) nvpair_value_double(nvp, &val); - (void) fprintf(fp, " 0x%f", val); + RENDER(pctl, double, nvl, name, val); break; } case DATA_TYPE_STRING: { char *val; (void) nvpair_value_string(nvp, &val); - (void) fprintf(fp, " %s", val); + RENDER(pctl, string, nvl, name, val); break; } case DATA_TYPE_BOOLEAN_ARRAY: { boolean_t *val; (void) nvpair_value_boolean_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " %d", val[i]); + ARENDER(pctl, boolean_array, nvl, name, val, nelem); break; } case DATA_TYPE_BYTE_ARRAY: { uchar_t *val; (void) nvpair_value_byte_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " 0x%2.2x", val[i]); + ARENDER(pctl, byte_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT8_ARRAY: { int8_t *val; (void) nvpair_value_int8_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " %d", val[i]); + ARENDER(pctl, int8_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT8_ARRAY: { uint8_t *val; (void) 
nvpair_value_uint8_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " 0x%x", val[i]); + ARENDER(pctl, uint8_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT16_ARRAY: { int16_t *val; (void) nvpair_value_int16_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " %d", val[i]); + ARENDER(pctl, int16_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT16_ARRAY: { uint16_t *val; (void) nvpair_value_uint16_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " 0x%x", val[i]); + ARENDER(pctl, uint16_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT32_ARRAY: { int32_t *val; (void) nvpair_value_int32_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " %d", val[i]); + ARENDER(pctl, int32_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT32_ARRAY: { uint32_t *val; (void) nvpair_value_uint32_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " 0x%x", val[i]); + ARENDER(pctl, uint32_array, nvl, name, val, nelem); break; } case DATA_TYPE_INT64_ARRAY: { int64_t *val; (void) nvpair_value_int64_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " %" PRId64, val[i]); + ARENDER(pctl, int64_array, nvl, name, val, nelem); break; } case DATA_TYPE_UINT64_ARRAY: { uint64_t *val; (void) nvpair_value_uint64_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " 0x%" PRIx64, val[i]); + ARENDER(pctl, uint64_array, nvl, name, val, nelem); break; } case DATA_TYPE_STRING_ARRAY: { char **val; (void) nvpair_value_string_array(nvp, &val, &nelem); - for (i = 0; i < nelem; i++) - (void) fprintf(fp, " %s", val[i]); + ARENDER(pctl, string_array, nvl, name, val, nelem); break; } case DATA_TYPE_HRTIME: { hrtime_t val; (void) nvpair_value_hrtime(nvp, &val); - (void) fprintf(fp, " 0x%jx", (intmax_t)val); + RENDER(pctl, hrtime, nvl, name, val); break; } case DATA_TYPE_NVLIST: { nvlist_t *val; (void) nvpair_value_nvlist(nvp, &val); - (void) fprintf(fp, " (embedded nvlist)\n"); - nvlist_print_with_indent(fp, val, depth + 1); - indent(fp, depth + 1); - (void) fprintf(fp, "(end %s)\n", name); + RENDER(pctl, nvlist, nvl, name, val); break; } case DATA_TYPE_NVLIST_ARRAY: { nvlist_t **val; (void) nvpair_value_nvlist_array(nvp, &val, &nelem); - (void) fprintf(fp, " (array of embedded nvlists)\n"); - for (i = 0; i < nelem; i++) { - indent(fp, depth + 1); - (void) fprintf(fp, - "(start %s[%d])\n", name, i); - nvlist_print_with_indent(fp, val[i], depth + 1); - indent(fp, depth + 1); - (void) fprintf(fp, "(end %s[%d])\n", name, i); - } + ARENDER(pctl, nvlist_array, nvl, name, val, nelem); break; } default: (void) fprintf(fp, " unknown data type (%d)", type); break; } - (void) fprintf(fp, "\n"); nvp = nvlist_next_nvpair(nvl, nvp); } } @@ -267,9 +755,17 @@ nvlist_print_with_indent(FILE *fp, nvlis void nvlist_print(FILE *fp, nvlist_t *nvl) { - nvlist_print_with_indent(fp, nvl, 0); + struct nvlist_prtctl pc; + + prtctl_defaults(fp, &pc, NULL); + nvlist_print_with_indent(nvl, &pc); } +void +nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl) +{ + nvlist_print_with_indent(nvl, pctl); +} #define NVP(elem, type, vtype, ptype, format) { \ vtype value; \ @@ -298,6 +794,7 @@ dump_nvlist(nvlist_t *list, int indent) { nvpair_t *elem = NULL; boolean_t bool_value; + boolean_t *bool_array_value; nvlist_t *nvlist_value; nvlist_t **nvlist_array_value; uint_t i, count; @@ -308,6 +805,10 @@ dump_nvlist(nvlist_t *list, int indent) while ((elem = 
nvlist_next_nvpair(list, elem)) != NULL) { switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + (void) printf("%*s%s\n", indent, "", nvpair_name(elem)); + break; + case DATA_TYPE_BOOLEAN_VALUE: (void) nvpair_value_boolean_value(elem, &bool_value); (void) printf("%*s%s: %s\n", indent, "", @@ -343,17 +844,27 @@ dump_nvlist(nvlist_t *list, int indent) break; case DATA_TYPE_INT64: - NVP(elem, int64, int64_t, int64_t, "%" PRIx64); + NVP(elem, int64, int64_t, longlong_t, "%lld"); break; case DATA_TYPE_UINT64: - NVP(elem, uint64, uint64_t, uint64_t, "%" PRIu64); + NVP(elem, uint64, uint64_t, u_longlong_t, "%llu"); break; case DATA_TYPE_STRING: NVP(elem, string, char *, char *, "'%s'"); break; + case DATA_TYPE_BOOLEAN_ARRAY: + (void) nvpair_value_boolean_array(elem, + &bool_array_value, &count); + for (i = 0; i < count; i++) { + (void) printf("%*s%s[%d]: %s\n", indent, "", + nvpair_name(elem), i, + bool_array_value[i] ? "true" : "false"); + } + break; + case DATA_TYPE_BYTE_ARRAY: NVPA(elem, byte_array, uchar_t, int, "%u"); break; @@ -383,12 +894,12 @@ dump_nvlist(nvlist_t *list, int indent) break; case DATA_TYPE_INT64_ARRAY: - NVPA(elem, int64_array, int64_t, int64_t, "%" PRId64); + NVPA(elem, int64_array, int64_t, longlong_t, "%lld"); break; case DATA_TYPE_UINT64_ARRAY: - NVPA(elem, uint64_array, uint64_t, uint64_t, - "%" PRIu64); + NVPA(elem, uint64_array, uint64_t, u_longlong_t, + "%llu"); break; case DATA_TYPE_STRING_ARRAY: @@ -421,6 +932,14 @@ dump_nvlist(nvlist_t *list, int indent) } /* + * ====================================================================== + * | | + * | Misc private interface. | + * | | + * ====================================================================== + */ + +/* * Determine if string 'value' matches 'nvp' value. The 'value' string is * converted, depending on the type of 'nvp', prior to match. For numeric * types, a radix independent sscanf conversion of 'value' is used. If 'nvp' @@ -710,7 +1229,8 @@ nvpair_value_match_regex(nvpair_t *nvp, break; } case DATA_TYPE_BOOLEAN_VALUE: { - boolean_t val, val_arg; + int32_t val_arg; + boolean_t val; /* scanf boolean_t from value and check for match */ sr = sscanf(value, "%"SCNi32, &val_arg); @@ -721,7 +1241,8 @@ nvpair_value_match_regex(nvpair_t *nvp, break; } case DATA_TYPE_BOOLEAN_ARRAY: { - boolean_t *val_array, val_arg; + boolean_t *val_array; + int32_t val_arg; /* check indexed value of array for match */ sr = sscanf(value, "%"SCNi32, &val_arg); Index: src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 libnvpair.h --- src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h 27 Feb 2010 22:30:14 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h 11 Dec 2014 11:54:49 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
*/ #ifndef _LIBNVPAIR_H @@ -35,10 +35,159 @@ extern "C" { #endif -void nvlist_print(FILE *, nvlist_t *); -int nvpair_value_match(nvpair_t *, int, char *, char **); -int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **); -void dump_nvlist(nvlist_t *, int); +/* + * All interfaces described in this file are private to Solaris, and + * are subject to change at any time and without notice. The public + * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR, + * are all imported from included above. + */ + +extern int nvpair_value_match(nvpair_t *, int, char *, char **); +extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, + char **); + +extern void nvlist_print(FILE *, nvlist_t *); +extern int nvlist_print_json(FILE *, nvlist_t *); +extern void dump_nvlist(nvlist_t *, int); + +/* + * Private nvlist printing interface that allows the caller some control + * over output rendering (as opposed to nvlist_print and dump_nvlist). + * + * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc + * (NULL on failure); on return the cookie is set up for default formatting + * and rendering. Quote the cookie in subsequent customisation functions and + * then pass the cookie to nvlist_prt to render the nvlist. Finally, + * use nvlist_prtctl_free to release the cookie. + * + * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions + * we have a corresponding brace of functions that appoint replacement + * rendering functions: + * + * extern void nvlist_prtctl_xxx(nvlist_prtctl_t, + * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, + * xxxtype value)) + * + * and + * + * extern void nvlist_prtctl_xxx_array(nvlist_prtctl_t, + * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, + * xxxtype value, uint_t count)) + * + * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8" + * and char * for "string". The function that is appointed to render the + * specified datatype receives as arguments the cookie, the nvlist + * member name, the value of that member (or a pointer for array function), + * and (for array rendering functions) a count of the number of elements. + */ + +typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */ + +enum nvlist_indent_mode { + NVLIST_INDENT_ABS, /* Absolute indentation */ + NVLIST_INDENT_TABBED /* Indent with tabstops */ +}; + +extern nvlist_prtctl_t nvlist_prtctl_alloc(void); +extern void nvlist_prtctl_free(nvlist_prtctl_t); +extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t); + +/* Output stream */ +extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *); +extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t); + +/* Indentation mode, start indent, indent increment; default tabbed/0/1 */ +extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode, + int, int); +extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int); + +enum nvlist_prtctl_fmt { + NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */ + NVLIST_FMT_MEMBER_POSTAMBLE, /* after nvlist member; default "\n" */ + NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */ +}; + +extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, + const char *); +extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...); + +/* + * Function prototypes for interfaces that appoint a new rendering function + * for single-valued nvlist members. 
+ * + * A replacement function receives arguments as follows: + * + * nvlist_prtctl_t Print control structure; do not change preferences + * for this object from a print callback function. + * + * void * The function-private cookie argument registered + * when the replacement function was appointed. + * + * nvlist_t * The full nvlist that is being processed. The + * rendering function is called to render a single + * member (name and value passed as below) but it may + * want to reference or incorporate other aspects of + * the full nvlist. + * + * const char * Member name to render + * + * valtype Value of the member to render + * + * The function must return non-zero if it has rendered output for this + * member, or 0 if it wants to default to standard rendering for this + * one member. + */ + +#define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \ + extern void funcname(nvlist_prtctl_t, \ + int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \ + void *) + +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *); + +#undef NVLIST_PRINTCTL_SVDECL /* was just for "clarity" above */ + +/* + * Function prototypes for interfaces that appoint a new rendering function + * for array-valued nvlist members. + * + * One additional argument is taken: uint_t for the number of array elements + * + * Return values as above. 
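For the array-valued appointment functions declared in the next hunk, the callback takes one extra uint_t element count, as described above. A sketch of a replacement uint64-array renderer; the function name and the hex formatting are illustrative:

#include <stdio.h>
#include <libnvpair.h>

static int
render_u64_array_hex(nvlist_prtctl_t pctl, void *private, nvlist_t *nvl,
    const char *name, uint64_t *vals, uint_t count)
{
	FILE *fp = nvlist_prtctl_getdest(pctl);
	uint_t i;

	/* private and nvl are unused in this sketch */
	nvlist_prtctl_doindent(pctl, 1);
	nvlist_prtctl_dofmt(pctl, NVLIST_FMT_MEMBER_NAME, name);
	for (i = 0; i < count; i++)
		(void) fprintf(fp, "%s0x%llx", i == 0 ? "" : " ",
		    (u_longlong_t)vals[i]);
	return (1);
}

/* nvlist_prtctlop_uint64_array(pctl, render_u64_array_hex, NULL); */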
+ */ +#define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \ + extern void funcname(nvlist_prtctl_t, \ + int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \ + void *) + +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **); + +#undef NVLIST_PRINTCTL_AVDECL /* was just for "clarity" above */ #ifdef __cplusplus } Index: src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c diff -N src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libnvpair/nvpair_json.c 11 Dec 2014 11:54:49 -0000 @@ -0,0 +1,403 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright (c) 2014, Joyent, Inc. + */ + +#include +#include +#include +#include +#include + +#include "libnvpair.h" + +#define FPRINTF(fp, ...) \ + do { \ + if (fprintf(fp, __VA_ARGS__) < 0) \ + return (-1); \ + } while (0) + +/* + * When formatting a string for JSON output we must escape certain characters, + * as described in RFC4627. This applies to both member names and + * DATA_TYPE_STRING values. + * + * This function will only operate correctly if the following conditions are + * met: + * + * 1. The input String is encoded in the current locale. + * + * 2. The current locale includes the Basic Multilingual Plane (plane 0) + * as defined in the Unicode standard. + * + * The output will be entirely 7-bit ASCII (as a subset of UTF-8) with all + * representable Unicode characters included in their escaped numeric form. + */ +static int +nvlist_print_json_string(FILE *fp, const char *input) +{ + mbstate_t mbr; + wchar_t c; + size_t sz; + + bzero(&mbr, sizeof (mbr)); + + FPRINTF(fp, "\""); + while ((sz = mbrtowc(&c, input, MB_CUR_MAX, &mbr)) > 0) { + switch (c) { + case '"': + FPRINTF(fp, "\\\""); + break; + case '\n': + FPRINTF(fp, "\\n"); + break; + case '\r': + FPRINTF(fp, "\\r"); + break; + case '\\': + FPRINTF(fp, "\\\\"); + break; + case '\f': + FPRINTF(fp, "\\f"); + break; + case '\t': + FPRINTF(fp, "\\t"); + break; + case '\b': + FPRINTF(fp, "\\b"); + break; + default: + if ((c >= 0x00 && c <= 0x1f) || + (c > 0x7f && c <= 0xffff)) { + /* + * Render both Control Characters and Unicode + * characters in the Basic Multilingual Plane + * as JSON-escaped multibyte characters. 
+ */ + FPRINTF(fp, "\\u%04x", (int)(0xffff & c)); + } else if (c >= 0x20 && c <= 0x7f) { + /* + * Render other 7-bit ASCII characters directly + * and drop other, unrepresentable characters. + */ + FPRINTF(fp, "%c", (int)(0xff & c)); + } + break; + } + input += sz; + } + + if (sz == (size_t)-1 || sz == (size_t)-2) { + /* + * We last read an invalid multibyte character sequence, + * so return an error. + */ + return (-1); + } + + FPRINTF(fp, "\""); + return (0); +} + +/* + * Dump a JSON-formatted representation of an nvlist to the provided FILE *. + * This routine does not output any new-lines or additional whitespace other + * than that contained in strings, nor does it call fflush(3C). + */ +int +nvlist_print_json(FILE *fp, nvlist_t *nvl) +{ + nvpair_t *curr; + boolean_t first = B_TRUE; + + FPRINTF(fp, "{"); + + for (curr = nvlist_next_nvpair(nvl, NULL); curr; + curr = nvlist_next_nvpair(nvl, curr)) { + data_type_t type = nvpair_type(curr); + + if (!first) + FPRINTF(fp, ","); + else + first = B_FALSE; + + if (nvlist_print_json_string(fp, nvpair_name(curr)) == -1) + return (-1); + FPRINTF(fp, ":"); + + switch (type) { + case DATA_TYPE_STRING: { + char *string = fnvpair_value_string(curr); + if (nvlist_print_json_string(fp, string) == -1) + return (-1); + break; + } + + case DATA_TYPE_BOOLEAN: { + FPRINTF(fp, "true"); + break; + } + + case DATA_TYPE_BOOLEAN_VALUE: { + FPRINTF(fp, "%s", fnvpair_value_boolean_value(curr) == + B_TRUE ? "true" : "false"); + break; + } + + case DATA_TYPE_BYTE: { + FPRINTF(fp, "%hhu", fnvpair_value_byte(curr)); + break; + } + + case DATA_TYPE_INT8: { + FPRINTF(fp, "%hhd", fnvpair_value_int8(curr)); + break; + } + + case DATA_TYPE_UINT8: { + FPRINTF(fp, "%hhu", fnvpair_value_uint8_t(curr)); + break; + } + + case DATA_TYPE_INT16: { + FPRINTF(fp, "%hd", fnvpair_value_int16(curr)); + break; + } + + case DATA_TYPE_UINT16: { + FPRINTF(fp, "%hu", fnvpair_value_uint16(curr)); + break; + } + + case DATA_TYPE_INT32: { + FPRINTF(fp, "%d", fnvpair_value_int32(curr)); + break; + } + + case DATA_TYPE_UINT32: { + FPRINTF(fp, "%u", fnvpair_value_uint32(curr)); + break; + } + + case DATA_TYPE_INT64: { + FPRINTF(fp, "%lld", + (long long)fnvpair_value_int64(curr)); + break; + } + + case DATA_TYPE_UINT64: { + FPRINTF(fp, "%llu", + (unsigned long long)fnvpair_value_uint64(curr)); + break; + } + + case DATA_TYPE_HRTIME: { + hrtime_t val; + VERIFY0(nvpair_value_hrtime(curr, &val)); + FPRINTF(fp, "%llu", (unsigned long long)val); + break; + } + + case DATA_TYPE_DOUBLE: { + double val; + VERIFY0(nvpair_value_double(curr, &val)); + FPRINTF(fp, "%f", val); + break; + } + + case DATA_TYPE_NVLIST: { + if (nvlist_print_json(fp, + fnvpair_value_nvlist(curr)) == -1) + return (-1); + break; + } + + case DATA_TYPE_STRING_ARRAY: { + char **val; + uint_t valsz, i; + VERIFY0(nvpair_value_string_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + if (nvlist_print_json_string(fp, val[i]) == -1) + return (-1); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + uint_t valsz, i; + VERIFY0(nvpair_value_nvlist_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + if (nvlist_print_json(fp, val[i]) == -1) + return (-1); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_BOOLEAN_ARRAY: { + boolean_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_boolean_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 
0) + FPRINTF(fp, ","); + FPRINTF(fp, val[i] == B_TRUE ? + "true" : "false"); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_BYTE_ARRAY: { + uchar_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_byte_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hhu", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint8_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hhu", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int8_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hhd", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint16_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hu", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int16_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hd", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint32_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%u", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int32_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%d", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint64_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%llu", + (unsigned long long)val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int64_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%lld", (long long)val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UNKNOWN: + return (-1); + } + } + + FPRINTF(fp, "}"); + return (0); +} Index: src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 libuutil.h --- src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h 7 Aug 2009 18:32:34 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/lib/libuutil/common/libuutil.h 12 Jun 2012 05:55:36 -0000 @@ -19,15 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 
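The nvlist_print_json() routine added above emits a single compact JSON object with no trailing newline, escaping member names and string values as described in the RFC 4627 comment. A minimal usage sketch; the member names and values are made up for illustration:

#include <stdio.h>
#include <libnvpair.h>

void
emit_json(void)
{
	nvlist_t *nvl;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return;

	(void) nvlist_add_string(nvl, "pool", "tank");
	(void) nvlist_add_uint64(nvl, "guid", 4660);
	(void) nvlist_add_boolean_value(nvl, "healthy", B_TRUE);

	/* Expected shape: {"pool":"tank","guid":4660,"healthy":true} */
	if (nvlist_print_json(stdout, nvl) == 0)
		(void) printf("\n");	/* the routine adds no newline itself */

	nvlist_free(nvl);
}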
*/ #ifndef _LIBUUTIL_H #define _LIBUUTIL_H +#include #include #include +#include #ifdef __cplusplus extern "C" { @@ -142,12 +143,21 @@ extern int uu_open_tmp(const char *dir, /* * Convenience functions. */ +#define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0])) + /*PRINTFLIKE1*/ extern char *uu_msprintf(const char *format, ...); extern void *uu_zalloc(size_t); extern char *uu_strdup(const char *); extern void uu_free(void *); +extern boolean_t uu_strcaseeq(const char *a, const char *b); +extern boolean_t uu_streq(const char *a, const char *b); +extern char *uu_strndup(const char *s, size_t n); +extern boolean_t uu_strbw(const char *a, const char *b); +extern void *uu_memdup(const void *buf, size_t sz); +extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len); + /* * Comparison function type definition. * Developers should be careful in their use of the _private argument. If you Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 uu_alloc.c --- src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c 7 Aug 2009 18:32:34 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_alloc.c 12 Jun 2012 05:55:36 -0000 @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ #include "libuutil_common.h" @@ -67,6 +66,44 @@ uu_strdup(const char *str) return (buf); } +/* + * Duplicate up to n bytes of a string. Kind of sort of like + * strdup(strlcpy(s, n)). + */ +char * +uu_strndup(const char *s, size_t n) +{ + size_t len; + char *p; + + len = strnlen(s, n); + p = uu_zalloc(len + 1); + if (p == NULL) + return (NULL); + + if (len > 0) + (void) memcpy(p, s, len); + p[len] = '\0'; + + return (p); +} + +/* + * Duplicate a block of memory. Combines malloc with memcpy, much as + * strdup combines malloc, strlen, and strcpy. + */ +void * +uu_memdup(const void *buf, size_t sz) +{ + void *p; + + p = uu_zalloc(sz); + if (p == NULL) + return (NULL); + (void) memcpy(p, buf, sz); + return (p); +} + char * uu_msprintf(const char *format, ...) { Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 uu_dprintf.c --- src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c 7 Aug 2009 18:32:34 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_dprintf.c 12 Jun 2012 05:55:36 -0000 @@ -33,7 +33,7 @@ #include #include #include -#include +#include #define FACILITY_FMT "%s (%s): " Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c,v retrieving revision 1.3 diff -u -p -r1.3 uu_misc.c --- src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c 26 May 2017 22:50:35 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_misc.c 30 May 2017 10:49:07 -0000 @@ -20,14 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include "libuutil_common.h" +#define HAVE_ASSFAIL 1 + #include #include #include @@ -39,6 +38,7 @@ #include #include #include +#include #if !defined(TEXT_DOMAIN) #define TEXT_DOMAIN "SYS_TEST" @@ -74,10 +74,7 @@ static uint32_t _uu_main_error; void uu_set_error(uint_t code) { - if (thr_main() != 0) { - _uu_main_error = code; - return; - } + #if defined(PTHREAD_ONCE_KEY_NP) if (pthread_key_create_once_np(&uu_error_key, NULL) != 0) uu_error_key_setup = -1; @@ -103,8 +100,6 @@ uu_set_error(uint_t code) uint32_t uu_error(void) { - if (thr_main() != 0) - return (_uu_main_error); if (uu_error_key_setup < 0) /* can't happen? */ return (UU_ERROR_UNKNOWN); @@ -254,3 +249,30 @@ uu_init(void) { (void) pthread_atfork(uu_lockup, uu_release, uu_release_child); } + +/* + * Dump a block of memory in hex+ascii, for debugging + */ +void +uu_dump(FILE *out, const char *prefix, const void *buf, size_t len) +{ + const unsigned char *p = buf; + int i; + + for (i = 0; i < len; i += 16) { + int j; + + (void) fprintf(out, "%s", prefix); + for (j = 0; j < 16 && i + j < len; j++) { + (void) fprintf(out, "%2.2x ", p[i + j]); + } + for (; j < 16; j++) { + (void) fprintf(out, " "); + } + for (j = 0; j < 16 && i + j < len; j++) { + (void) fprintf(out, "%c", + isprint(p[i + j]) ? p[i + j] : '.'); + } + (void) fprintf(out, "\n"); + } +} Index: src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 uu_pname.c --- src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c 7 Aug 2009 18:32:35 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/lib/libuutil/common/uu_pname.c 13 Apr 2017 17:31:23 -0000 @@ -172,9 +172,7 @@ uu_setpname(char *arg0) * than in each of its consumers. */ if (arg0 == NULL) { - pname = getexecname(); - if (pname == NULL) - pname = "unknown_command"; + pname = "unknown_command"; return (pname); } Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h,v retrieving revision 1.2 diff -u -p -r1.2 libzfs.h --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h 10 Jan 2017 19:20:34 -0000 1.2 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h 22 Apr 2017 17:17:31 -0000 @@ -20,8 +20,14 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2012 Martin Matuska . All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. 
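The new libuutil convenience routines above (uu_strndup(), uu_memdup(), UU_NELEM(), uu_dump()) cover the common duplicate-and-dump patterns. A small sketch, with made-up data and most error handling omitted:

#include <stdio.h>
#include <libuutil.h>

static const char *tags[] = { "alpha", "beta", "gamma" };

void
uu_demo(void)
{
	char *head;
	void *copy;
	size_t i;

	head = uu_strndup(tags[0], 3);		/* "alp", NUL-terminated */
	copy = uu_memdup(tags, sizeof (tags));	/* byte-for-byte duplicate */

	for (i = 0; i < UU_NELEM(tags); i++)
		(void) printf("%zu: %s\n", i, tags[i]);

	if (copy != NULL)			/* hex + ASCII debug dump */
		uu_dump(stderr, "uu_demo: ", copy, sizeof (tags));

	uu_free(head);
	uu_free(copy);
}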
*/ #ifndef _LIBZFS_H @@ -35,7 +41,8 @@ #include #include #include -#include +#include +#include #ifdef __cplusplus extern "C" { @@ -44,15 +51,14 @@ extern "C" { /* * Miscellaneous ZFS constants */ -#define ZFS_MAXNAMELEN MAXNAMELEN -#define ZPOOL_MAXNAMELEN MAXNAMELEN #define ZFS_MAXPROPLEN MAXPATHLEN #define ZPOOL_MAXPROPLEN MAXPATHLEN /* * libzfs errors */ -enum { +typedef enum zfs_error { + EZFS_SUCCESS = 0, /* no error -- success */ EZFS_NOMEM = 2000, /* out of memory */ EZFS_BADPROP, /* invalid property value */ EZFS_PROPREADONLY, /* cannot set readonly property */ @@ -86,14 +92,13 @@ enum { EZFS_SHARENFSFAILED, /* share(1M) failed */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ + EZFS_FAULT, /* bad address */ EZFS_IO, /* I/O error */ EZFS_INTR, /* signal received */ EZFS_ISSPARE, /* device is a hot spare */ EZFS_INVALCONFIG, /* invalid vdev configuration */ EZFS_RECURSIVE, /* recursive dependency */ EZFS_NOHISTORY, /* no history object */ - EZFS_UNSHAREISCSIFAILED, /* iscsitgtd failed request to unshare */ - EZFS_SHAREISCSIFAILED, /* iscsitgtd failed request to share */ EZFS_POOLPROPS, /* couldn't retrieve pool props */ EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ EZFS_POOL_INVALARG, /* invalid argument for this pool operation */ @@ -101,12 +106,10 @@ enum { EZFS_OPENFAILED, /* open of device failed */ EZFS_NOCAP, /* couldn't get capacity */ EZFS_LABELFAILED, /* write of label failed */ - EZFS_ISCSISVCUNAVAIL, /* iscsi service unavailable */ EZFS_BADWHO, /* invalid permission who */ EZFS_BADPERM, /* invalid permission */ EZFS_BADPERMSET, /* invalid permission set name */ EZFS_NODELEGATION, /* delegated administration is disabled */ - EZFS_PERMRDONLY, /* pemissions are readonly */ EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ @@ -121,8 +124,13 @@ enum { EZFS_PIPEFAILED, /* pipe create failed */ EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ + EZFS_SCRUBBING, /* currently scrubbing */ + EZFS_NO_SCRUB, /* no active scrub */ + EZFS_DIFF, /* general failure of zfs diff */ + EZFS_DIFFDATA, /* bad zfs diff data */ + EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_UNKNOWN -}; +} zfs_error_t; /* * The following data structures are all part @@ -178,9 +186,13 @@ extern libzfs_handle_t *zfs_get_handle(z extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); +extern void zfs_save_arguments(int argc, char **, char *, int); +extern int zpool_log_history(libzfs_handle_t *, const char *); + extern int libzfs_errno(libzfs_handle_t *); extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *); +extern int zfs_standard_error(libzfs_handle_t *, int, const char *); extern void libzfs_mnttab_init(libzfs_handle_t *); extern void libzfs_mnttab_fini(libzfs_handle_t *); extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); @@ -199,20 +211,23 @@ extern void zpool_close(zpool_handle_t * extern const char *zpool_get_name(zpool_handle_t *); extern int zpool_get_state(zpool_handle_t *); extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); +extern const char *zpool_pool_state_to_name(pool_state_t); extern void zpool_free_handles(libzfs_handle_t *); +extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); /* * Iterate over all active pools in the system. 
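With the libzfs error list converted to a named zfs_error_t enumeration, callers can store and compare libzfs_errno() results symbolically instead of against anonymous constants. A hedged sketch; zpool_open() is the pre-existing libzfs open routine (not part of this hunk) and the messages are illustrative:

#include <stdio.h>
#include <libzfs.h>

int
try_open_pool(libzfs_handle_t *hdl, const char *name)
{
	zpool_handle_t *zhp;
	zfs_error_t err;

	if ((zhp = zpool_open(hdl, name)) == NULL) {
		err = libzfs_errno(hdl);
		if (err == EZFS_PERM)
			(void) fprintf(stderr, "insufficient privileges\n");
		else
			(void) fprintf(stderr, "%s: %s\n",
			    libzfs_error_action(hdl),
			    libzfs_error_description(hdl));
		return (-1);
	}
	zpool_close(zhp);
	return (0);
}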
*/ typedef int (*zpool_iter_f)(zpool_handle_t *, void *); extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); +extern boolean_t zpool_skip_pool(const char *); /* * Functions to create and destroy pools */ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); -extern int zpool_destroy(zpool_handle_t *); +extern int zpool_destroy(zpool_handle_t *, const char *); extern int zpool_add(zpool_handle_t *, nvlist_t *); typedef struct splitflags { @@ -226,8 +241,10 @@ typedef struct splitflags { /* * Functions to manipulate pool and vdev state */ -extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t); +extern int zpool_scan(zpool_handle_t *, pool_scan_func_t); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); +extern int zpool_reguid(zpool_handle_t *); +extern int zpool_reopen(zpool_handle_t *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); @@ -247,14 +264,14 @@ extern nvlist_t *zpool_find_vdev(zpool_h boolean_t *, boolean_t *); extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); -extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); +extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); /* * Functions to manage pool properties */ extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, - size_t proplen, zprop_source_t *); + size_t proplen, zprop_source_t *, boolean_t); extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); @@ -285,6 +302,15 @@ typedef enum { ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ /* + * If the pool has unsupported features but can still be opened in + * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the + * pool has unsupported features but cannot be opened at all, its + * status is ZPOOL_STATUS_UNSUP_FEAT_READ. + */ + ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ + ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ + + /* * These faults have no corresponding message ID. At the time we are * checking the status, the original reason for the FMA fault (I/O or * checksum errors) has been lost. @@ -297,10 +323,12 @@ typedef enum { * requiring administrative attention. There is no corresponding * message ID. */ - ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */ + ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ + ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ - ZPOOL_STATUS_OFFLINE_DEV, /* device online */ + ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ /* * Finally, the following indicates a healthy pool. @@ -316,18 +344,20 @@ extern void zpool_dump_ddt(const ddt_sta * Statistics and configuration functions. 
*/ extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); +extern nvlist_t *zpool_get_features(zpool_handle_t *); extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); /* * Import and export functions */ -extern int zpool_export(zpool_handle_t *, boolean_t); -extern int zpool_export_force(zpool_handle_t *); +extern int zpool_export(zpool_handle_t *, boolean_t, const char *); +extern int zpool_export_force(zpool_handle_t *, const char *); extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, - nvlist_t *, boolean_t); + nvlist_t *, int); +extern void zpool_print_unsup_feat(nvlist_t *config); /* * Search for pools to import @@ -356,7 +386,7 @@ extern nvlist_t *zpool_find_import_cache */ struct zfs_cmd; -extern const char *hist_event_table[LOG_END]; +extern const char *zfs_history_event_names[]; extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, boolean_t verbose); @@ -364,12 +394,9 @@ extern int zpool_upgrade(zpool_handle_t extern int zpool_get_history(zpool_handle_t *, nvlist_t **); extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); -extern void zpool_set_history_str(const char *subcommand, int argc, - char **argv, char *history_str); -extern int zpool_stage_history(libzfs_handle_t *, const char *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); -extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); +extern int zfs_ioctl(libzfs_handle_t *, int request, struct zfs_cmd *); extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); @@ -379,10 +406,12 @@ extern void zpool_explain_recover(libzfs * underlying datasets, only the references to them. */ extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); +extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); extern void zfs_close(zfs_handle_t *); extern zfs_type_t zfs_get_type(const zfs_handle_t *); extern const char *zfs_get_name(const zfs_handle_t *); extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); +extern const char *zfs_get_pool_name(const zfs_handle_t *); /* * Property management functions. 
Some functions are shared with the kernel, @@ -398,10 +427,11 @@ extern const char *zfs_prop_column_name( extern boolean_t zfs_prop_align_right(zfs_prop_t); extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, - nvlist_t *, uint64_t, zfs_handle_t *, const char *); + nvlist_t *, uint64_t, zfs_handle_t *, zpool_handle_t *, const char *); extern const char *zfs_prop_to_name(zfs_prop_t); extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); +extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, @@ -412,12 +442,20 @@ extern int zfs_prop_get_userquota_int(zf uint64_t *propvalue); extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, char *propbuf, int proplen, boolean_t literal); +extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); +extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, + char *buf, size_t len); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); extern int zfs_prop_is_string(zfs_prop_t prop); extern nvlist_t *zfs_get_user_props(zfs_handle_t *); extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); +extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); + typedef struct zprop_list { int pl_prop; @@ -429,16 +467,26 @@ typedef struct zprop_list { boolean_t pl_fixed; } zprop_list_t; -extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t); +extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, + boolean_t); extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" +#define ZFS_FEATURE_DISABLED "disabled" +#define ZFS_FEATURE_ENABLED "enabled" +#define ZFS_FEATURE_ACTIVE "active" + +#define ZFS_UNSUPPORTED_INACTIVE "inactive" +#define ZFS_UNSUPPORTED_READONLY "readonly" + /* * zpool property management */ extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); +extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, + size_t); extern const char *zpool_prop_default_string(zpool_prop_t); extern uint64_t zpool_prop_default_numeric(zpool_prop_t); extern const char *zpool_prop_column_name(zpool_prop_t); @@ -490,8 +538,21 @@ extern int zfs_iter_root(libzfs_handle_t extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); -extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); +extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); + +typedef struct get_all_cb { + zfs_handle_t **cb_handles; + size_t cb_alloc; + size_t cb_used; + boolean_t cb_verbose; + int (*cb_getone)(zfs_handle_t *, void *); +} get_all_cb_t; + +void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); +int 
libzfs_dataset_cmp(const void *, const void *); /* * Functions to create and destroy datasets. @@ -501,83 +562,135 @@ extern int zfs_create(libzfs_handle_t *, extern int zfs_create_ancestors(libzfs_handle_t *, const char *); extern int zfs_destroy(zfs_handle_t *, boolean_t); extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); +extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); +extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, + nvlist_t *props); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); -extern int zfs_rename(zfs_handle_t *, const char *, boolean_t); + +typedef struct renameflags { + /* recursive rename */ + int recurse : 1; + + /* don't unmount file systems */ + int nounmount : 1; + + /* force unmount file systems */ + int forceunmount : 1; +} renameflags_t; + +extern int zfs_rename(zfs_handle_t *, const char *, const char *, + renameflags_t flags); typedef struct sendflags { /* print informational messages (ie, -v was specified) */ - int verbose : 1; + boolean_t verbose; /* recursive send (ie, -R) */ - int replicate : 1; + boolean_t replicate; /* for incrementals, do all intermediate snapshots */ - int doall : 1; /* (ie, -I) */ + boolean_t doall; /* if dataset is a clone, do incremental from its origin */ - int fromorigin : 1; + boolean_t fromorigin; /* do deduplication */ - int dedup : 1; + boolean_t dedup; /* send properties (ie, -p) */ - int props : 1; + boolean_t props; + + /* do not send (no-op, ie. -n) */ + boolean_t dryrun; + + /* parsable verbose output (ie. -P) */ + boolean_t parsable; + + /* show progress (ie. -v) */ + boolean_t progress; + + /* large blocks (>128K) are permitted */ + boolean_t largeblock; + + /* WRITE_EMBEDDED records of type DATA are permitted */ + boolean_t embed_data; } sendflags_t; typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); extern int zfs_send(zfs_handle_t *, const char *, const char *, - sendflags_t, int, snapfilter_cb_t, void *); + sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); +extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags); +extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, + const char *); +extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, + const char *token); extern int zfs_promote(zfs_handle_t *); -extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, - boolean_t, boolean_t); -extern int zfs_hold_range(zfs_handle_t *, const char *, const char *, - const char *, boolean_t, boolean_t); +extern int zfs_hold(zfs_handle_t *, const char *, const char *, + boolean_t, int); +extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); -extern int zfs_release_range(zfs_handle_t *, const char *, const char *, - const char *, boolean_t); +extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, uid_t rid, uint64_t space); -extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, - zfs_userspace_cb_t func, void *arg); +extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, + zfs_userspace_cb_t, void *); + +extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); +extern int 
zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ - int verbose : 1; + boolean_t verbose; /* the destination is a prefix, not the exact fs (ie, -d) */ - int isprefix : 1; + boolean_t isprefix; /* * Only the tail of the sent snapshot path is appended to the * destination to determine the received snapshot name (ie, -e). */ - int istail : 1; + boolean_t istail; /* do not actually do the recv, just check if it would work (ie, -n) */ - int dryrun : 1; + boolean_t dryrun; /* rollback/destroy filesystems as necessary (eg, -F) */ - int force : 1; + boolean_t force; /* set "canmount=off" on all modified filesystems */ - int canmountoff : 1; + boolean_t canmountoff; + + /* + * Mark the file systems as "resumable" and do not destroy them if the + * receive is interrupted + */ + boolean_t resumable; /* byteswap flag is used internally; callers need not specify */ - int byteswap : 1; + boolean_t byteswap; /* do not mount file systems as they are extracted (private) */ - int nomount : 1; + boolean_t nomount; } recvflags_t; -extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t, - int, avl_tree_t *); +extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, + recvflags_t *, int, avl_tree_t *); + +typedef enum diff_flags { + ZFS_DIFF_PARSEABLE = 0x1, + ZFS_DIFF_TIMESTAMP = 0x2, + ZFS_DIFF_CLASSIFY = 0x4 +} diff_flags_t; + +extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, + int); /* * Miscellaneous functions. @@ -589,6 +702,7 @@ extern zfs_handle_t *zfs_path_to_zhandle extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_spa_version(zfs_handle_t *, int *); +extern boolean_t zfs_bookmark_exists(const char *path); /* * Mount support functions. @@ -620,21 +734,26 @@ extern int zfs_unshareall_nfs(zfs_handle extern int zfs_unshareall_smb(zfs_handle_t *); extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); extern int zfs_unshareall(zfs_handle_t *); -extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *); -extern int zfs_share_iscsi(zfs_handle_t *); -extern int zfs_unshare_iscsi(zfs_handle_t *); -extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *); extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); +#ifdef __FreeBSD__ +/* + * FreeBSD-specific jail support function. + */ +extern int zfs_jail(zfs_handle_t *, int, int); +#endif + /* * When dealing with nvlists, verify() is extremely useful */ +#ifndef verify #ifdef NDEBUG #define verify(EX) ((void)(EX)) #else #define verify(EX) assert(EX) #endif +#endif /* * Utility function to convert a number to a human-readable form. 
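The sendflags_t members are now plain boolean_t fields and zfs_send() takes the flags by pointer, with a trailing nvlist_t ** for debug output. A sketch of a full (non-incremental) send under the new signature; the comments map the fields to their command-line options as documented in the struct above:

#include <string.h>
#include <libzfs.h>

int
send_full_stream(zfs_handle_t *zhp, const char *tosnap, int outfd)
{
	sendflags_t flags;

	(void) memset(&flags, 0, sizeof (flags));
	flags.props = B_TRUE;		/* send properties (-p) */
	flags.largeblock = B_TRUE;	/* allow blocks > 128K */

	/* NULL fromsnap => full stream; no filter callback, no debug nvlist */
	return (zfs_send(zhp, NULL, tosnap, &flags, outfd, NULL, NULL, NULL));
}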
@@ -684,6 +803,11 @@ extern boolean_t libzfs_fru_compare(libz extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); +#ifndef illumos +extern int zmount(const char *, const char *, int, char *, char *, int, char *, + int); +#endif + #ifdef __cplusplus } #endif Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 libzfs_changelist.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c 27 Feb 2010 22:30:18 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c 10 Oct 2016 11:14:25 -0000 @@ -20,10 +20,14 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Portions Copyright 2007 Ramprakash Jelari + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2014, 2015 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov */ #include @@ -116,27 +120,14 @@ changelist_prefix(prop_changelist_t *clp if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) continue; - if (ZFS_IS_VOLUME(cn->cn_handle)) { - switch (clp->cl_realprop) { - case ZFS_PROP_NAME: - /* If this was a rename, unshare the zvol */ - (void) zfs_unshare_iscsi(cn->cn_handle); - break; - - case ZFS_PROP_VOLSIZE: - /* - * If this was a change to the volume size, we - * need to unshare and reshare the volume. - */ - (void) zfs_unshare_iscsi(cn->cn_handle); - break; - } - } else { + if (!ZFS_IS_VOLUME(cn->cn_handle)) { /* * Do the property specific processing. 
*/ switch (clp->cl_prop) { case ZFS_PROP_MOUNTPOINT: + if (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) + break; if (zfs_unmount(cn->cn_handle, NULL, clp->cl_mflags) != 0) { ret = -1; @@ -146,6 +137,9 @@ changelist_prefix(prop_changelist_t *clp case ZFS_PROP_SHARESMB: (void) zfs_unshare_smb(cn->cn_handle, NULL); break; + + default: + break; } } } @@ -183,8 +177,10 @@ changelist_postfix(prop_changelist_t *cl if ((cn = uu_list_last(clp->cl_list)) == NULL) return (0); - if (clp->cl_prop == ZFS_PROP_MOUNTPOINT) + if (clp->cl_prop == ZFS_PROP_MOUNTPOINT && + !(clp->cl_gflags & CL_GATHER_DONT_UNMOUNT)) { remove_mountpoint(cn->cn_handle); + } /* * It is possible that the changelist_prefix() used libshare @@ -224,24 +220,8 @@ changelist_postfix(prop_changelist_t *cl zfs_refresh_properties(cn->cn_handle); - if (ZFS_IS_VOLUME(cn->cn_handle)) { - if (cn->cn_shared || - clp->cl_prop == ZFS_PROP_SHAREISCSI) { - if (zfs_prop_get(cn->cn_handle, - ZFS_PROP_SHAREISCSI, shareopts, - sizeof (shareopts), NULL, NULL, 0, - B_FALSE) == 0 && - strcmp(shareopts, "off") == 0) { - errors += - zfs_unshare_iscsi(cn->cn_handle); - } else { - errors += - zfs_share_iscsi(cn->cn_handle); - } - } - + if (ZFS_IS_VOLUME(cn->cn_handle)) continue; - } /* * Remount if previously mounted or mountpoint was legacy, @@ -255,7 +235,8 @@ changelist_postfix(prop_changelist_t *cl shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); - mounted = zfs_is_mounted(cn->cn_handle, NULL); + mounted = (clp->cl_gflags & CL_GATHER_DONT_UNMOUNT) || + zfs_is_mounted(cn->cn_handle, NULL); if (!mounted && (cn->cn_mounted || ((sharenfs || sharesmb || clp->cl_waslegacy) && @@ -316,7 +297,7 @@ void changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) { prop_changenode_t *cn; - char newname[ZFS_MAXNAMELEN]; + char newname[ZFS_MAX_DATASET_NAME_LEN]; for (cn = uu_list_first(clp->cl_list); cn != NULL; cn = uu_list_next(clp->cl_list, cn)) { @@ -498,7 +479,6 @@ change_one(zfs_handle_t *zhp, void *data * This is necessary when the original mountpoint * is legacy or none. */ - ASSERT(!clp->cl_alldependents); verify(uu_list_insert_before(clp->cl_list, uu_list_first(clp->cl_list), cn) == 0); } @@ -640,8 +620,7 @@ changelist_gather(zfs_handle_t *zhp, zfs if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && clp->cl_prop != ZFS_PROP_SHARENFS && - clp->cl_prop != ZFS_PROP_SHARESMB && - clp->cl_prop != ZFS_PROP_SHAREISCSI) + clp->cl_prop != ZFS_PROP_SHARESMB) return (clp); /* Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.c 25 Apr 2017 23:51:09 -0000 @@ -0,0 +1,122 @@ +/* + * CDDL HEADER SART + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. + */ + +#include + +#include "libzfs_compat.h" + +int zfs_ioctl_version = ZFS_IOCVER_UNDEF; +static int zfs_spa_version = -1; + +/* + * Get zfs_ioctl_version + */ +int +get_zfs_ioctl_version(void) +{ + size_t ver_size; + int ver = ZFS_IOCVER_NONE; + + ver_size = sizeof(ver); + if (sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0) < 0) + err(1, "sysctl vfs.zfs.version.ioctl failed"); + + return (ver); +} + +/* + * Get the SPA version + */ +static int +get_zfs_spa_version(void) +{ + size_t ver_size; + int ver = 0; + + ver_size = sizeof(ver); + if (sysctlbyname("vfs.zfs.version.spa", &ver, &ver_size, NULL, 0) < 0) + err(1, "sysctl vfs.zfs.version.spa failed"); + + return (ver); +} + +/* + * This is FreeBSD version of ioctl, because Solaris' ioctl() updates + * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an + * error is returned zc_nvlist_dst_size won't be updated. + */ +int +zcmd_ioctl(int fd, int request, zfs_cmd_t *zc) +{ + size_t oldsize; + int ret, cflag = ZFS_CMD_COMPAT_NONE; + + if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) + zfs_ioctl_version = get_zfs_ioctl_version(); + + if (zfs_ioctl_version >= ZFS_IOCVER_DEADMAN) { + switch (zfs_ioctl_version) { + case ZFS_IOCVER_RESUME: + cflag = ZFS_CMD_COMPAT_RESUME; + break; + case ZFS_IOCVER_EDBP: + cflag = ZFS_CMD_COMPAT_EDBP; + break; + case ZFS_IOCVER_ZCMD: + cflag = ZFS_CMD_COMPAT_ZCMD; + break; + case ZFS_IOCVER_LZC: + cflag = ZFS_CMD_COMPAT_LZC; + break; + case ZFS_IOCVER_DEADMAN: + cflag = ZFS_CMD_COMPAT_DEADMAN; + break; + } + } else { + /* + * If vfs.zfs.version.ioctl is not defined, assume we have v28 + * compatible binaries and use vfs.zfs.version.spa to test for v15 + */ + cflag = ZFS_CMD_COMPAT_V28; + + if (zfs_spa_version < 0) + zfs_spa_version = get_zfs_spa_version(); + + if (zfs_spa_version == SPA_VERSION_15 || + zfs_spa_version == SPA_VERSION_14 || + zfs_spa_version == SPA_VERSION_13) + cflag = ZFS_CMD_COMPAT_V15; + } + + oldsize = zc->zc_nvlist_dst_size; + ret = zcmd_ioctl_compat(fd, request, zc, cflag); + + if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { + ret = -1; + errno = ENOMEM; + } + + return (ret); +} Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_compat.h 20 Apr 2017 22:54:17 -0000 @@ -0,0 +1,44 @@ +/* + * CDDL HEADER SART + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. + */ + +#ifndef _LIBZFS_COMPAT_H +#define _LIBZFS_COMPAT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int get_zfs_ioctl_version(void); +int zcmd_ioctl(int fd, int request, zfs_cmd_t *zc); + +#define ioctl(fd, ioc, zc) zcmd_ioctl((fd), (ioc), (zc)) + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_COMPAT_H */ Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 libzfs_config.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c 27 Feb 2010 22:30:18 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c 10 Oct 2016 11:14:25 -0000 @@ -18,12 +18,19 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2015 by Syneto S.R.L. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. + */ + +/* * The pool configuration repository is stored in /etc/zfs/zpool.cache as a * single packed nvlist. While it would be nice to just read in this * file from userland, this wouldn't work from a local zone. So we have to have @@ -218,6 +225,37 @@ zpool_get_config(zpool_handle_t *zhp, nv } /* + * Retrieves a list of enabled features and their refcounts and caches it in + * the pool handle. + */ +nvlist_t * +zpool_get_features(zpool_handle_t *zhp) +{ + nvlist_t *config, *features; + + config = zpool_get_config(zhp, NULL); + + if (config == NULL || !nvlist_exists(config, + ZPOOL_CONFIG_FEATURE_STATS)) { + int error; + boolean_t missing = B_FALSE; + + error = zpool_refresh_stats(zhp, &missing); + + if (error != 0 || missing) + return (NULL); + + config = zpool_get_config(zhp, NULL); + } + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + &features) != 0) + return (NULL); + + return (features); +} + +/* * Refresh the vdev statistics associated with the given pool. This is used in * iostat to show configuration changes and determine the delta from the last * time the function was called. This function can fail, in case the pool has @@ -281,8 +319,7 @@ zpool_refresh_stats(zpool_handle_t *zhp, verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &newtxg) == 0); - if (zhp->zpool_old_config != NULL) - nvlist_free(zhp->zpool_old_config); + nvlist_free(zhp->zpool_old_config); if (oldtxg != newtxg) { nvlist_free(zhp->zpool_config); @@ -302,6 +339,62 @@ zpool_refresh_stats(zpool_handle_t *zhp, } /* + * The following environment variables are undocumented + * and should be used for testing purposes only: + * + * __ZFS_POOL_EXCLUDE - don't iterate over the pools it lists + * __ZFS_POOL_RESTRICT - iterate only over the pools it lists + * + * This function returns B_TRUE if the pool should be skipped + * during iteration. 
+ */ +boolean_t +zpool_skip_pool(const char *poolname) +{ + static boolean_t initialized = B_FALSE; + static const char *exclude = NULL; + static const char *restricted = NULL; + + const char *cur, *end; + int len; + int namelen = strlen(poolname); + + if (!initialized) { + initialized = B_TRUE; + exclude = getenv("__ZFS_POOL_EXCLUDE"); + restricted = getenv("__ZFS_POOL_RESTRICT"); + } + + if (exclude != NULL) { + cur = exclude; + do { + end = strchr(cur, ' '); + len = (NULL == end) ? strlen(cur) : (end - cur); + if (len == namelen && 0 == strncmp(cur, poolname, len)) + return (B_TRUE); + cur += (len + 1); + } while (NULL != end); + } + + if (NULL == restricted) + return (B_FALSE); + + cur = restricted; + do { + end = strchr(cur, ' '); + len = (NULL == end) ? strlen(cur) : (end - cur); + + if (len == namelen && 0 == strncmp(cur, poolname, len)) { + return (B_FALSE); + } + + cur += (len + 1); + } while (NULL != end); + + return (B_TRUE); +} + +/* * Iterate over all pools in the system. */ int @@ -324,6 +417,9 @@ zpool_iter(libzfs_handle_t *hdl, zpool_i for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { + if (zpool_skip_pool(cn->cn_name)) + continue; + if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) { hdl->libzfs_pool_iter--; return (-1); @@ -359,6 +455,9 @@ zfs_iter_root(libzfs_handle_t *hdl, zfs_ for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { + if (zpool_skip_pool(cn->cn_name)) + continue; + if ((zhp = make_dataset_handle(hdl, cn->cn_name)) == NULL) continue; Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c,v retrieving revision 1.3 diff -u -p -r1.3 libzfs_dataset.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c 3 Apr 2010 19:01:15 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c 22 Apr 2017 08:56:07 -0000 @@ -20,8 +20,16 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. + * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright (c) 2013 Martin Matuska. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2016 Igor Kozhukhov */ #include @@ -41,14 +49,12 @@ #include #include #include -#include #include -#include -#include #include #include #include +#include #include #include "zfs_namecheck.h" @@ -73,66 +79,30 @@ zfs_type_to_name(zfs_type_t type) return (dgettext(TEXT_DOMAIN, "snapshot")); case ZFS_TYPE_VOLUME: return (dgettext(TEXT_DOMAIN, "volume")); + case ZFS_TYPE_POOL: + return (dgettext(TEXT_DOMAIN, "pool")); + case ZFS_TYPE_BOOKMARK: + return (dgettext(TEXT_DOMAIN, "bookmark")); + default: + assert(!"unhandled zfs_type_t"); } return (NULL); } /* - * Given a path and mask of ZFS types, return a string describing this dataset. - * This is used when we fail to open a dataset and we cannot get an exact type. 
- * We guess what the type would have been based on the path and the mask of - * acceptable types. - */ -static const char * -path_to_str(const char *path, int types) -{ - /* - * When given a single type, always report the exact type. - */ - if (types == ZFS_TYPE_SNAPSHOT) - return (dgettext(TEXT_DOMAIN, "snapshot")); - if (types == ZFS_TYPE_FILESYSTEM) - return (dgettext(TEXT_DOMAIN, "filesystem")); - if (types == ZFS_TYPE_VOLUME) - return (dgettext(TEXT_DOMAIN, "volume")); - - /* - * The user is requesting more than one type of dataset. If this is the - * case, consult the path itself. If we're looking for a snapshot, and - * a '@' is found, then report it as "snapshot". Otherwise, remove the - * snapshot attribute and try again. - */ - if (types & ZFS_TYPE_SNAPSHOT) { - if (strchr(path, '@') != NULL) - return (dgettext(TEXT_DOMAIN, "snapshot")); - return (path_to_str(path, types & ~ZFS_TYPE_SNAPSHOT)); - } - - /* - * The user has requested either filesystems or volumes. - * We have no way of knowing a priori what type this would be, so always - * report it as "filesystem" or "volume", our two primitive types. - */ - if (types & ZFS_TYPE_FILESYSTEM) - return (dgettext(TEXT_DOMAIN, "filesystem")); - - assert(types & ZFS_TYPE_VOLUME); - return (dgettext(TEXT_DOMAIN, "volume")); -} - -/* * Validate a ZFS path. This is used even before trying to open the dataset, to * provide a more meaningful error message. We call zfs_error_aux() to * explain exactly why the name was not valid. */ -static int +int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, boolean_t modifying) { namecheck_err_t why; char what; + (void) zfs_prop_get_table(); if (dataset_namecheck(path, &why, &what) != 0) { if (hdl != NULL) { switch (why) { @@ -181,6 +151,11 @@ zfs_validate_name(libzfs_handle_t *hdl, zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "reserved disk name")); break; + + default: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "(%d) not defined"), why); + break; } } @@ -290,7 +265,7 @@ zpool_handle(zfs_handle_t *zhp) int len; zpool_handle_t *zph; - len = strcspn(zhp->zfs_name, "/@") + 1; + len = strcspn(zhp->zfs_name, "/@#") + 1; pool_name = zfs_alloc(zhp->zfs_hdl, len); (void) strlcpy(pool_name, zhp->zfs_name, len); @@ -459,7 +434,9 @@ make_dataset_handle_common(zfs_handle_t else abort(); /* we should never see any other types */ - zhp->zpool_hdl = zpool_handle(zhp); + if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) + return (-1); + return (0); } @@ -492,7 +469,7 @@ make_dataset_handle(libzfs_handle_t *hdl return (zhp); } -static zfs_handle_t * +zfs_handle_t * make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) { zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); @@ -509,6 +486,133 @@ make_dataset_handle_zc(libzfs_handle_t * return (zhp); } +zfs_handle_t * +make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) +{ + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = pzhp->zfs_hdl; + (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); + zhp->zfs_head_type = pzhp->zfs_type; + zhp->zfs_type = ZFS_TYPE_SNAPSHOT; + zhp->zpool_hdl = zpool_handle(zhp); + return (zhp); +} + +zfs_handle_t * +zfs_handle_dup(zfs_handle_t *zhp_orig) +{ + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = zhp_orig->zfs_hdl; + zhp->zpool_hdl = zhp_orig->zpool_hdl; + (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, + sizeof (zhp->zfs_name)); + zhp->zfs_type = 
zhp_orig->zfs_type; + zhp->zfs_head_type = zhp_orig->zfs_head_type; + zhp->zfs_dmustats = zhp_orig->zfs_dmustats; + if (zhp_orig->zfs_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + if (zhp_orig->zfs_user_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_user_props, + &zhp->zfs_user_props, 0) != 0) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + if (zhp_orig->zfs_recvd_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_recvd_props, + &zhp->zfs_recvd_props, 0)) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; + if (zhp_orig->zfs_mntopts != NULL) { + zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, + zhp_orig->zfs_mntopts); + } + zhp->zfs_props_table = zhp_orig->zfs_props_table; + return (zhp); +} + +boolean_t +zfs_bookmark_exists(const char *path) +{ + nvlist_t *bmarks; + nvlist_t *props; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *bmark_name; + char *pound; + int err; + boolean_t rv; + + + (void) strlcpy(fsname, path, sizeof (fsname)); + pound = strchr(fsname, '#'); + if (pound == NULL) + return (B_FALSE); + + *pound = '\0'; + bmark_name = pound + 1; + props = fnvlist_alloc(); + err = lzc_get_bookmarks(fsname, props, &bmarks); + nvlist_free(props); + if (err != 0) { + nvlist_free(bmarks); + return (B_FALSE); + } + + rv = nvlist_exists(bmarks, bmark_name); + nvlist_free(bmarks); + return (rv); +} + +zfs_handle_t * +make_bookmark_handle(zfs_handle_t *parent, const char *path, + nvlist_t *bmark_props) +{ + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + /* Fill in the name. */ + zhp->zfs_hdl = parent->zfs_hdl; + (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); + + /* Set the property lists. */ + if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { + free(zhp); + return (NULL); + } + + /* Set the types. */ + zhp->zfs_head_type = parent->zfs_head_type; + zhp->zfs_type = ZFS_TYPE_BOOKMARK; + + if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { + nvlist_free(zhp->zfs_props); + free(zhp); + return (NULL); + } + + return (zhp); +} + /* * Opens the given snapshot, filesystem, or volume. The 'types' * argument is a mask of acceptable types. 
The function will print an @@ -542,6 +646,22 @@ zfs_open(libzfs_handle_t *hdl, const cha return (NULL); } + if (zhp == NULL) { + char *at = strchr(path, '@'); + + if (at != NULL) + *at = '\0'; + errno = 0; + if ((zhp = make_dataset_handle(hdl, path)) == NULL) { + (void) zfs_standard_error(hdl, errno, errbuf); + return (NULL); + } + if (at != NULL) + *at = '@'; + (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); + zhp->zfs_type = ZFS_TYPE_SNAPSHOT; + } + if (!(types & zhp->zfs_type)) { (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); zfs_close(zhp); @@ -592,7 +712,7 @@ libzfs_mnttab_init(libzfs_handle_t *hdl) sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); } -void +static void libzfs_mnttab_update(libzfs_handle_t *hdl) { struct mnttab entry; @@ -618,7 +738,8 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl) void *cookie = NULL; mnttab_node_t *mtn; - while (mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) { + while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) + != NULL) { free(mtn->mtn_mt.mnt_special); free(mtn->mtn_mt.mnt_mountp); free(mtn->mtn_mt.mnt_fstype); @@ -690,7 +811,8 @@ libzfs_mnttab_remove(libzfs_handle_t *hd mnttab_node_t *ret; find.mtn_mt.mnt_special = (char *)fsname; - if (ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) { + if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) + != NULL) { avl_remove(&hdl->libzfs_mnttab_cache, ret); free(ret->mtn_mt.mnt_special); free(ret->mtn_mt.mnt_mountp); @@ -739,7 +861,8 @@ zfs_which_resv_prop(zfs_handle_t *zhp, z */ nvlist_t * zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, - uint64_t zoned, zfs_handle_t *zhp, const char *errbuf) + uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, + const char *errbuf) { nvpair_t *elem; uint64_t intval; @@ -872,6 +995,12 @@ zfs_valid_proplist(libzfs_handle_t *hdl, goto error; } continue; + } else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); + goto error; } if (prop == ZPROP_INVAL) { @@ -923,36 +1052,39 @@ zfs_valid_proplist(libzfs_handle_t *hdl, break; } - case ZFS_PROP_RECORDSIZE: case ZFS_PROP_VOLBLOCKSIZE: - /* must be power of two within SPA_{MIN,MAX}BLOCKSIZE */ - if (intval < SPA_MINBLOCKSIZE || - intval > SPA_MAXBLOCKSIZE || !ISP2(intval)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be power of 2 from %u " - "to %uk"), propname, - (uint_t)SPA_MINBLOCKSIZE, - (uint_t)SPA_MAXBLOCKSIZE >> 10); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; + case ZFS_PROP_RECORDSIZE: + { + int maxbs = SPA_MAXBLOCKSIZE; + if (zpool_hdl != NULL) { + maxbs = zpool_get_prop_int(zpool_hdl, + ZPOOL_PROP_MAXBLOCKSIZE, NULL); } - break; - - case ZFS_PROP_SHAREISCSI: - if (strcmp(strval, "off") != 0 && - strcmp(strval, "on") != 0 && - strcmp(strval, "type=disk") != 0) { + /* + * Volumes are limited to a volblocksize of 128KB, + * because they typically service workloads with + * small random writes, which incur a large performance + * penalty with large blocks. + */ + if (prop == ZFS_PROP_VOLBLOCKSIZE) + maxbs = SPA_OLD_MAXBLOCKSIZE; + /* + * The value must be a power of two between + * SPA_MINBLOCKSIZE and maxbs. 
+ */ + if (intval < SPA_MINBLOCKSIZE || + intval > maxbs || !ISP2(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' must be 'on', 'off', or 'type=disk'"), - propname); + "'%s' must be power of 2 from 512B " + "to %uKB"), propname, maxbs >> 10); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } - break; -#ifdef PORT_SOLARIS + } case ZFS_PROP_MLSLABEL: { +#ifdef illumos /* * Verify the mlslabel string and convert to * internal hex label string. @@ -1001,10 +1133,15 @@ badlabel: "invalid mlslabel '%s'"), strval); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); m_label_free(new_sl); /* OK if null */ +#else /* !illumos */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "mlslabel is not supported on FreeBSD")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); +#endif /* illumos */ goto error; } -#endif + case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; @@ -1027,6 +1164,13 @@ badlabel: "component of '%s' is too long"), propname); break; + + default: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "(%d) not defined"), + why); + break; } (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; @@ -1145,12 +1289,17 @@ badlabel: } break; + case ZFS_PROP_UTF8ONLY: chosen_utf = (int)intval; break; + case ZFS_PROP_NORMALIZE: chosen_normal = (int)intval; break; + + default: + break; } /* @@ -1199,6 +1348,9 @@ badlabel: goto error; } break; + + default: + break; } } } @@ -1223,39 +1375,56 @@ badlabel: (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } + return (ret); + +error: + nvlist_free(ret); + return (NULL); +} + +int +zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) +{ + uint64_t old_volsize; + uint64_t new_volsize; + uint64_t old_reservation; + uint64_t new_reservation; + zfs_prop_t resv_prop; + nvlist_t *props; /* * If this is an existing volume, and someone is setting the volsize, * make sure that it matches the reservation, or add it if necessary. 
*/ - if (zhp != NULL && type == ZFS_TYPE_VOLUME && - nvlist_lookup_uint64(ret, zfs_prop_to_name(ZFS_PROP_VOLSIZE), - &intval) == 0) { - uint64_t old_volsize = zfs_prop_get_int(zhp, - ZFS_PROP_VOLSIZE); - uint64_t old_reservation; - uint64_t new_reservation; - zfs_prop_t resv_prop; - - if (zfs_which_resv_prop(zhp, &resv_prop) < 0) - goto error; - old_reservation = zfs_prop_get_int(zhp, resv_prop); + old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + if (zfs_which_resv_prop(zhp, &resv_prop) < 0) + return (-1); + old_reservation = zfs_prop_get_int(zhp, resv_prop); - if (old_volsize == old_reservation && - nvlist_lookup_uint64(ret, zfs_prop_to_name(resv_prop), - &new_reservation) != 0) { - if (nvlist_add_uint64(ret, - zfs_prop_to_name(resv_prop), intval) != 0) { - (void) no_memory(hdl); - goto error; - } - } + props = fnvlist_alloc(); + fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); + + if ((zvol_volsize_to_reservation(old_volsize, props) != + old_reservation) || nvlist_exists(nvl, + zfs_prop_to_name(resv_prop))) { + fnvlist_free(props); + return (0); } - return (ret); + if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), + &new_volsize) != 0) { + fnvlist_free(props); + return (-1); + } + new_reservation = zvol_volsize_to_reservation(new_volsize, props); + fnvlist_free(props); -error: - nvlist_free(ret); - return (NULL); + if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), + new_reservation) != 0) { + (void) no_memory(zhp->zfs_hdl); + return (-1); + } + return (1); } void @@ -1300,6 +1469,12 @@ zfs_setprop_error(libzfs_handle_t *hdl, (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); break; + case E2BIG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property value too long")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + break; + case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool and or dataset must be upgraded to set this " @@ -1308,11 +1483,27 @@ zfs_setprop_error(libzfs_handle_t *hdl, break; case ERANGE: - if (prop == ZFS_PROP_COMPRESSION) { + case EDOM: + if (prop == ZFS_PROP_COMPRESSION || + prop == ZFS_PROP_RECORDSIZE) { (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property setting is not allowed on " "bootable datasets")); (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); + } else if (prop == ZFS_PROP_CHECKSUM || + prop == ZFS_PROP_DEDUP) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "root pools")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case EINVAL: + if (prop == ZPROP_INVAL) { + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); } else { (void) zfs_standard_error(hdl, err, errbuf); } @@ -1340,15 +1531,10 @@ zfs_setprop_error(libzfs_handle_t *hdl, int zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) { - zfs_cmd_t zc = { 0 }; int ret = -1; - prop_changelist_t *cl = NULL; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; - nvlist_t *nvl = NULL, *realprops; - zfs_prop_t prop; - boolean_t do_prefix; - uint64_t idx; + nvlist_t *nvl = NULL; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), @@ -1360,66 +1546,187 @@ zfs_prop_set(zfs_handle_t *zhp, const ch goto error; } - if ((realprops = zfs_valid_proplist(hdl, zhp->zfs_type, nvl, - zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, errbuf)) == NULL) - goto error; + ret = zfs_prop_set_list(zhp, nvl); +error: nvlist_free(nvl); - nvl = realprops; + 
return (ret); +} - prop = zfs_name_to_prop(propname); - if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) - goto error; - if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "child dataset with inherited mountpoint is used " - "in a non-global zone")); - ret = zfs_error(hdl, EZFS_ZONED, errbuf); +/* + * Given an nvlist of property names and values, set the properties for the + * given dataset. + */ +int +zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) +{ + zfs_cmd_t zc = { 0 }; + int ret = -1; + prop_changelist_t **cls = NULL; + int cl_idx; + char errbuf[1024]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *nvl; + int nvl_len; + int added_resv = 0; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), + zhp->zfs_name); + + if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, + errbuf)) == NULL) goto error; - } /* - * If the dataset's canmount property is being set to noauto, - * then we want to prevent unmounting & remounting it. + * We have to check for any extra properties which need to be added + * before computing the length of the nvlist. */ - do_prefix = !((prop == ZFS_PROP_CANMOUNT) && - (zprop_string_to_index(prop, propval, &idx, - ZFS_TYPE_DATASET) == 0) && (idx == ZFS_CANMOUNT_NOAUTO)); - - if (do_prefix && (ret = changelist_prefix(cl)) != 0) + for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL); + elem != NULL; + elem = nvlist_next_nvpair(nvl, elem)) { + if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && + (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { + goto error; + } + } + /* + * Check how many properties we're setting and allocate an array to + * store changelist pointers for postfix(). + */ + nvl_len = 0; + for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL); + elem != NULL; + elem = nvlist_next_nvpair(nvl, elem)) + nvl_len++; + if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) goto error; + cl_idx = 0; + for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL); + elem != NULL; + elem = nvlist_next_nvpair(nvl, elem)) { + + zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem)); + + assert(cl_idx < nvl_len); + /* + * We don't want to unmount & remount the dataset when changing + * its canmount property to 'on' or 'noauto'. We only use + * the changelist logic to unmount when setting canmount=off. + */ + if (prop != ZFS_PROP_CANMOUNT || + (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && + zfs_is_mounted(zhp, NULL))) { + cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); + if (cls[cl_idx] == NULL) + goto error; + } + + if (prop == ZFS_PROP_MOUNTPOINT && + changelist_haszonedchild(cls[cl_idx])) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "child dataset with inherited mountpoint is used " + "in a non-global zone")); + ret = zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + /* We don't support those properties on FreeBSD. 
*/ + switch (prop) { + case ZFS_PROP_DEVICES: + case ZFS_PROP_ISCSIOPTIONS: + case ZFS_PROP_XATTR: + case ZFS_PROP_VSCAN: + case ZFS_PROP_NBMAND: + case ZFS_PROP_MLSLABEL: + (void) snprintf(errbuf, sizeof (errbuf), + "property '%s' not supported on FreeBSD", + nvpair_name(elem)); + ret = zfs_error(hdl, EZFS_PERM, errbuf); + goto error; + } + + if (cls[cl_idx] != NULL && + (ret = changelist_prefix(cls[cl_idx])) != 0) + goto error; + + cl_idx++; + } + assert(cl_idx == nvl_len); + /* - * Execute the corresponding ioctl() to set this property. + * Execute the corresponding ioctl() to set this list of properties. */ (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) + if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 || + (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0) goto error; ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); if (ret != 0) { - zfs_setprop_error(hdl, prop, errno, errbuf); - } else { - if (do_prefix) - ret = changelist_postfix(cl); + /* Get the list of unset properties back and report them. */ + nvlist_t *errorprops = NULL; + if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) + goto error; + for (nvpair_t *elem = nvlist_next_nvpair(nvl, NULL); + elem != NULL; + elem = nvlist_next_nvpair(nvl, elem)) { + zfs_prop_t prop = zfs_name_to_prop(nvpair_name(elem)); + zfs_setprop_error(hdl, prop, errno, errbuf); + } + nvlist_free(errorprops); + + if (added_resv && errno == ENOSPC) { + /* clean up the volsize property we tried to set */ + uint64_t old_volsize = zfs_prop_get_int(zhp, + ZFS_PROP_VOLSIZE); + nvlist_free(nvl); + nvl = NULL; + zcmd_free_nvlists(&zc); - /* - * Refresh the statistics so the new property value - * is reflected. - */ - if (ret == 0) + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + goto error; + if (nvlist_add_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), + old_volsize) != 0) + goto error; + if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) + goto error; + (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); + } + } else { + for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { + if (cls[cl_idx] != NULL) { + int clp_err = changelist_postfix(cls[cl_idx]); + if (clp_err != 0) + ret = clp_err; + } + } + + /* + * Refresh the statistics so the new property value + * is reflected. + */ + if (ret == 0) (void) get_stats(zhp); } error: nvlist_free(nvl); zcmd_free_nvlists(&zc); - if (cl) - changelist_free(cl); + if (cls != NULL) { + for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { + if (cls[cl_idx] != NULL) + changelist_free(cls[cl_idx]); + } + free(cls); + } return (ret); } @@ -1477,7 +1784,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, cons return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); /* - * Normalize the name, to get rid of shorthand abbrevations. + * Normalize the name, to get rid of shorthand abbreviations. 
*/ propname = zfs_prop_to_name(prop); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); @@ -1550,22 +1857,21 @@ getprop_uint64(zfs_handle_t *zhp, zfs_pr return (value); } -static char * +static const char * getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) { nvlist_t *nv; - char *value; + const char *value; *source = NULL; if (nvlist_lookup_nvlist(zhp->zfs_props, zfs_prop_to_name(prop), &nv) == 0) { - verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); + value = fnvlist_lookup_string(nv, ZPROP_VALUE); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { verify(!zhp->zfs_props_table || zhp->zfs_props_table[prop] == B_TRUE); - if ((value = (char *)zfs_prop_default_string(prop)) == NULL) - value = ""; + value = zfs_prop_default_string(prop); *source = ""; } @@ -1649,6 +1955,9 @@ get_numeric_property(zfs_handle_t *zhp, mntopt_on = MNTOPT_NBMAND; mntopt_off = MNTOPT_NONBMAND; break; + + default: + break; } /* @@ -1706,6 +2015,10 @@ get_numeric_property(zfs_handle_t *zhp, case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: *val = getprop_uint64(zhp, prop, source); if (*source == NULL) { @@ -1740,11 +2053,14 @@ get_numeric_property(zfs_handle_t *zhp, zcmd_free_nvlists(&zc); return (-1); } - if (zplprops) - nvlist_free(zplprops); + nvlist_free(zplprops); zcmd_free_nvlists(&zc); break; + case ZFS_PROP_INCONSISTENT: + *val = zhp->zfs_dmustats.dds_inconsistent; + break; + default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: @@ -1821,8 +2137,6 @@ zfs_prop_get_recvd(zfs_handle_t *zhp, co err = zfs_prop_get(zhp, prop, propbuf, proplen, NULL, NULL, 0, literal); zfs_unset_recvd_props_mode(zhp, &cookie); - } else if (zfs_prop_userquota(propname)) { - return (-1); } else { nvlist_t *propval; char *recvdval; @@ -1837,6 +2151,117 @@ zfs_prop_get_recvd(zfs_handle_t *zhp, co return (err == 0 ? 0 : -1); } +static int +get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) +{ + nvlist_t *value; + nvpair_t *pair; + + value = zfs_get_clones_nvl(zhp); + if (value == NULL) + return (-1); + + propbuf[0] = '\0'; + for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; + pair = nvlist_next_nvpair(value, pair)) { + if (propbuf[0] != '\0') + (void) strlcat(propbuf, ",", proplen); + (void) strlcat(propbuf, nvpair_name(pair), proplen); + } + + return (0); +} + +struct get_clones_arg { + uint64_t numclones; + nvlist_t *value; + const char *origin; + char buf[ZFS_MAX_DATASET_NAME_LEN]; +}; + +int +get_clones_cb(zfs_handle_t *zhp, void *arg) +{ + struct get_clones_arg *gca = arg; + + if (gca->numclones == 0) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), + NULL, NULL, 0, B_TRUE) != 0) + goto out; + if (strcmp(gca->buf, gca->origin) == 0) { + fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); + gca->numclones--; + } + +out: + (void) zfs_iter_children(zhp, get_clones_cb, gca); + zfs_close(zhp); + return (0); +} + +nvlist_t * +zfs_get_clones_nvl(zfs_handle_t *zhp) +{ + nvlist_t *nv, *value; + + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { + struct get_clones_arg gca; + + /* + * if this is a snapshot, then the kernel wasn't able + * to get the clones. Do it by slowly iterating. 
+ */ + if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) + return (NULL); + if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { + nvlist_free(nv); + return (NULL); + } + + gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); + gca.value = value; + gca.origin = zhp->zfs_name; + + if (gca.numclones != 0) { + zfs_handle_t *root; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + char *cp = pool; + + /* get the pool name */ + (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); + (void) strsep(&cp, "/@"); + root = zfs_open(zhp->zfs_hdl, pool, + ZFS_TYPE_FILESYSTEM); + + (void) get_clones_cb(root, &gca); + } + + if (gca.numclones != 0 || + nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || + nvlist_add_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { + nvlist_free(nv); + nvlist_free(value); + return (NULL); + } + nvlist_free(nv); + nvlist_free(value); + verify(0 == nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), &nv)); + } + + verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0); + + return (value); +} + /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a @@ -1850,7 +2275,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop { char *source = NULL; uint64_t val; - char *str; + const char *str; const char *strval; boolean_t received = zfs_is_recvd_props_mode(zhp); @@ -1920,8 +2345,8 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop } if ((zpool_get_prop(zhp->zpool_hdl, - ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL)) || - (strcmp(root, "-") == 0)) + ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, + B_FALSE)) || (strcmp(root, "-") == 0)) root[0] = '\0'; /* * Special case an alternate root of '/'. This will @@ -1955,13 +2380,14 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop break; case ZFS_PROP_ORIGIN: - (void) strlcpy(propbuf, getprop_string(zhp, prop, &source), - proplen); - /* - * If there is no parent at all, return failure to indicate that - * it doesn't apply to this dataset. - */ - if (propbuf[0] == '\0') + str = getprop_string(zhp, prop, &source); + if (str == NULL) + return (-1); + (void) strlcpy(propbuf, str, proplen); + break; + + case ZFS_PROP_CLONES: + if (get_clones_string(zhp, propbuf, proplen) != 0) return (-1); break; @@ -1993,6 +2419,31 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop } break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: + + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + + /* + * If limit is UINT64_MAX, we translate this into 'none' (unless + * literal is set), and indicate that it's the default value. + * Otherwise, we print the number nicely and indicate that it's + * set locally. 
+ */ + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + } else if (val == UINT64_MAX) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(val, propbuf, proplen); + } + break; + + case ZFS_PROP_REFRATIO: case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); @@ -2012,6 +2463,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop case ZFS_TYPE_SNAPSHOT: str = "snapshot"; break; + case ZFS_TYPE_BOOKMARK: + str = "bookmark"; + break; default: abort(); } @@ -2042,9 +2496,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop */ (void) strlcpy(propbuf, zhp->zfs_name, proplen); break; -#ifdef PORT_SOLARIS + case ZFS_PROP_MLSLABEL: { +#ifdef illumos m_label_t *new_sl = NULL; char *ascii = NULL; /* human readable label */ @@ -2078,9 +2533,23 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop (void) strlcpy(propbuf, ascii, proplen); free(ascii); +#else /* !illumos */ + propbuf[0] = '\0'; +#endif /* illumos */ } break; -#endif + + case ZFS_PROP_GUID: + /* + * GUIDs are stored as numbers, but they are identifiers. + * We don't want them to be pretty printed, because pretty + * printing mangles the ID into a truncated and useless value. + */ + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); + break; + default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: @@ -2095,8 +2564,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop break; case PROP_TYPE_STRING: - (void) strlcpy(propbuf, - getprop_string(zhp, prop, &source), proplen); + str = getprop_string(zhp, prop, &source); + if (str == NULL) + return (-1); + (void) strlcpy(propbuf, str, proplen); break; case PROP_TYPE_INDEX: @@ -2172,19 +2643,16 @@ zfs_prop_get_numeric(zfs_handle_t *zhp, return (0); } -#ifdef PORT_SOLARIS /* NetBSD zfs QUOTA support */ static int idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, char **domainp, idmap_rid_t *ridp) { - idmap_handle_t *idmap_hdl = NULL; +#ifdef illumos idmap_get_handle_t *get_hdl = NULL; idmap_stat status; int err = EINVAL; - if (idmap_init(&idmap_hdl) != IDMAP_SUCCESS) - goto out; - if (idmap_get_create(idmap_hdl, &get_hdl) != IDMAP_SUCCESS) + if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) goto out; if (isuser) { @@ -2203,9 +2671,11 @@ idmap_id_to_numeric_domain_rid(uid_t id, out: if (get_hdl) idmap_get_destroy(get_hdl); - if (idmap_hdl) - (void) idmap_fini(idmap_hdl); return (err); +#else /* !illumos */ + assert(!"invalid code path"); + return (EINVAL); // silence compiler warning +#endif /* illumos */ } /* @@ -2223,7 +2693,7 @@ userquota_propname_decode(const char *pr boolean_t isuser; domain[0] = '\0'; - + *ridp = 0; /* Figure out the property type ({user|group}{quota|space}) */ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { if (strncmp(propname, zfs_userquota_prop_prefixes[type], @@ -2240,39 +2710,69 @@ userquota_propname_decode(const char *pr cp = strchr(propname, '@') + 1; if (strchr(cp, '@')) { +#ifdef illumos /* * It's a SID name (eg "user@domain") that needs to be * turned into S-1-domainID-RID. 
*/ - directory_error_t e; + int flag = 0; + idmap_stat stat, map_stat; + uid_t pid; + idmap_rid_t rid; + idmap_get_handle_t *gh = NULL; + + stat = idmap_get_create(&gh); + if (stat != IDMAP_SUCCESS) { + idmap_get_destroy(gh); + return (ENOMEM); + } if (zoned && getzoneid() == GLOBAL_ZONEID) return (ENOENT); if (isuser) { - e = directory_sid_from_user_name(NULL, - cp, &numericsid); + stat = idmap_getuidbywinname(cp, NULL, flag, &pid); + if (stat < 0) + return (ENOENT); + stat = idmap_get_sidbyuid(gh, pid, flag, &numericsid, + &rid, &map_stat); } else { - e = directory_sid_from_group_name(NULL, - cp, &numericsid); + stat = idmap_getgidbywinname(cp, NULL, flag, &pid); + if (stat < 0) + return (ENOENT); + stat = idmap_get_sidbygid(gh, pid, flag, &numericsid, + &rid, &map_stat); + } + if (stat < 0) { + idmap_get_destroy(gh); + return (ENOENT); } - if (e != NULL) { - directory_error_free(e); + stat = idmap_get_mappings(gh); + idmap_get_destroy(gh); + + if (stat < 0) { return (ENOENT); } if (numericsid == NULL) return (ENOENT); cp = numericsid; + *ridp = rid; /* will be further decoded below */ +#else /* !illumos */ + return (ENOENT); +#endif /* illumos */ } if (strncmp(cp, "S-1-", 4) == 0) { /* It's a numeric SID (eg "S-1-234-567-89") */ (void) strlcpy(domain, cp, domainlen); - cp = strrchr(domain, '-'); - *cp = '\0'; - cp++; - errno = 0; - *ridp = strtoull(cp, &end, 10); + if (*ridp == 0) { + cp = strrchr(domain, '-'); + *cp = '\0'; + cp++; + *ridp = strtoull(cp, &end, 10); + } else { + end = ""; + } if (numericsid) { free(numericsid); numericsid = NULL; @@ -2322,14 +2822,6 @@ userquota_propname_decode(const char *pr ASSERT3P(numericsid, ==, NULL); return (0); } -#else -static int -userquota_propname_decode(const char *propname, boolean_t zoned, - zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) -{ - return (ENOENT); -} -#endif /* PORT_SOLARIS */ static int zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, @@ -2338,7 +2830,7 @@ zfs_prop_get_userquota_common(zfs_handle int err; zfs_cmd_t zc = { 0 }; - (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); err = userquota_propname_decode(propname, zfs_prop_get_int(zhp, ZFS_PROP_ZONED), @@ -2390,144 +2882,85 @@ zfs_prop_get_userquota(zfs_handle_t *zhp return (0); } -/* - * Returns the name of the given zfs handle. - */ -const char * -zfs_get_name(const zfs_handle_t *zhp) -{ - return (zhp->zfs_name); -} - -/* - * Returns the type of the given zfs handle. 
- */ -zfs_type_t -zfs_get_type(const zfs_handle_t *zhp) +int +zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) { - return (zhp->zfs_type); -} + int err; + zfs_cmd_t zc = { 0 }; + const char *snapname; -static int -zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) -{ - int rc; - uint64_t orig_cookie; + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - orig_cookie = zc->zc_cookie; -top: - (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); - rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + snapname = strchr(propname, '@') + 1; + if (strchr(snapname, '@')) { + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + } else { + /* snapname is the short name, append it to zhp's fsname */ + char *cp; - if (rc == -1) { - switch (errno) { - case ENOMEM: - /* expand nvlist memory and try again */ - if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { - zcmd_free_nvlists(zc); - return (-1); - } - zc->zc_cookie = orig_cookie; - goto top; - /* - * An errno value of ESRCH indicates normal completion. - * If ENOENT is returned, then the underlying dataset - * has been removed since we obtained the handle. - */ - case ESRCH: - case ENOENT: - rc = 1; - break; - default: - rc = zfs_standard_error(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, - "cannot iterate filesystems")); - break; - } + (void) strlcpy(zc.zc_value, zhp->zfs_name, + sizeof (zc.zc_value)); + cp = strchr(zc.zc_value, '@'); + if (cp != NULL) + *cp = '\0'; + (void) strlcat(zc.zc_value, "@", sizeof (zc.zc_value)); + (void) strlcat(zc.zc_value, snapname, sizeof (zc.zc_value)); } - return (rc); + + err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SPACE_WRITTEN, &zc); + if (err) + return (err); + + *propvalue = zc.zc_cookie; + return (0); } -/* - * Iterate over all child filesystems - */ int -zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) +zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) { - zfs_cmd_t zc = { 0 }; - zfs_handle_t *nzhp; - int ret; - - if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) - return (0); + int err; + uint64_t propvalue; - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); + err = zfs_prop_get_written_int(zhp, propname, &propvalue); - while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, - &zc)) == 0) { - /* - * Silently ignore errors, as the only plausible explanation is - * that the pool has since been removed. - */ - if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, - &zc)) == NULL) { - continue; - } + if (err) + return (err); - if ((ret = func(nzhp, data)) != 0) { - zcmd_free_nvlists(&zc); - return (ret); - } + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", propvalue); + } else { + zfs_nicenum(propvalue, propbuf, proplen); } - zcmd_free_nvlists(&zc); - return ((ret < 0) ? ret : 0); + return (0); } /* - * Iterate over all snapshots + * Returns the name of the given zfs handle. 
*/ -int -zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data) +const char * +zfs_get_name(const zfs_handle_t *zhp) { - zfs_cmd_t zc = { 0 }; - zfs_handle_t *nzhp; - int ret; - - if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) - return (0); - - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); - while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, - &zc)) == 0) { - - if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, - &zc)) == NULL) { - continue; - } - - if ((ret = func(nzhp, data)) != 0) { - zcmd_free_nvlists(&zc); - return (ret); - } - } - zcmd_free_nvlists(&zc); - return ((ret < 0) ? ret : 0); + return (zhp->zfs_name); } /* - * Iterate over all children, snapshots and filesystems + * Returns the name of the parent pool for the given zfs handle. */ -int -zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) +const char * +zfs_get_pool_name(const zfs_handle_t *zhp) { - int ret; - - if ((ret = zfs_iter_filesystems(zhp, func, data)) != 0) - return (ret); + return (zhp->zpool_hdl->zpool_name); +} - return (zfs_iter_snapshots(zhp, func, data)); +/* + * Returns the type of the given zfs handle. + */ +zfs_type_t +zfs_get_type(const zfs_handle_t *zhp) +{ + return (zhp->zfs_type); } /* @@ -2553,18 +2986,19 @@ is_descendant(const char *ds1, const cha /* * Given a complete name, return just the portion that refers to the parent. - * Can return NULL if this is a pool. + * Will return -1 if there is no parent (path is just the name of the + * pool). */ static int parent_name(const char *path, char *buf, size_t buflen) { - char *loc; + char *slashp; - if ((loc = strrchr(path, '/')) == NULL) - return (-1); + (void) strlcpy(buf, path, buflen); - (void) strncpy(buf, path, MIN(buflen, loc - path)); - buf[loc - path] = '\0'; + if ((slashp = strrchr(buf, '/')) == NULL) + return (-1); + *slashp = '\0'; return (0); } @@ -2582,7 +3016,7 @@ check_parents(libzfs_handle_t *hdl, cons boolean_t accept_ancestor, int *prefixlen) { zfs_cmd_t zc = { 0 }; - char parent[ZFS_MAXNAMELEN]; + char parent[ZFS_MAX_DATASET_NAME_LEN]; char *slash; zfs_handle_t *zhp; char errbuf[1024]; @@ -2710,8 +3144,7 @@ create_parents(libzfs_handle_t *hdl, cha * up to the prefixlen-long one. */ for (cp = target + prefixlen + 1; - cp = strchr(cp, '/'); *cp = '/', cp++) { - char *logstr; + (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { *cp = '\0'; @@ -2722,16 +3155,12 @@ create_parents(libzfs_handle_t *hdl, cha continue; } - logstr = hdl->libzfs_log_str; - hdl->libzfs_log_str = NULL; if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, NULL) != 0) { - hdl->libzfs_log_str = logstr; opname = dgettext(TEXT_DOMAIN, "create"); goto ancestorerr; } - hdl->libzfs_log_str = logstr; h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); if (h == NULL) { opname = dgettext(TEXT_DOMAIN, "open"); @@ -2767,7 +3196,7 @@ zfs_create_ancestors(libzfs_handle_t *hd { int prefix; char *path_copy; - int rc; + int rc = 0; if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); @@ -2789,12 +3218,12 @@ int zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, nvlist_t *props) { - zfs_cmd_t zc = { 0 }; int ret; uint64_t size = 0; uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); char errbuf[1024]; uint64_t zoned; + enum lzc_dataset_type ost; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); @@ -2814,21 +3243,34 @@ zfs_create(libzfs_handle_t *hdl, const c * will return ENOENT, not EEXIST. 
To prevent this from happening, we * first try to see if the dataset exists. */ - (void) strlcpy(zc.zc_name, path, sizeof (zc.zc_name)); - if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { + if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset already exists")); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } if (type == ZFS_TYPE_VOLUME) - zc.zc_objset_type = DMU_OST_ZVOL; + ost = LZC_DATSET_TYPE_ZVOL; else - zc.zc_objset_type = DMU_OST_ZFS; + ost = LZC_DATSET_TYPE_ZFS; + + /* open zpool handle for prop validation */ + char pool_path[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(pool_path, path, sizeof (pool_path)); + + /* truncate pool_path at first slash */ + char *p = strchr(pool_path, '/'); + if (p != NULL) + *p = '\0'; + + zpool_handle_t *zpool_handle = zpool_open(hdl, pool_path); if (props && (props = zfs_valid_proplist(hdl, type, props, - zoned, NULL, errbuf)) == 0) + zoned, NULL, zpool_handle, errbuf)) == 0) { + zpool_close(zpool_handle); return (-1); + } + zpool_close(zpool_handle); if (type == ZFS_TYPE_VOLUME) { /* @@ -2876,18 +3318,13 @@ zfs_create(libzfs_handle_t *hdl, const c } } - if (props && zcmd_write_src_nvlist(hdl, &zc, props) != 0) - return (-1); - nvlist_free(props); - /* create the dataset */ - ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); - - zcmd_free_nvlists(&zc); + ret = lzc_create(path, ost, props); + nvlist_free(props); /* check for failure */ if (ret != 0) { - char parent[ZFS_MAXNAMELEN]; + char parent[ZFS_MAX_DATASET_NAME_LEN]; (void) parent_name(path, parent, sizeof (parent)); switch (errno) { @@ -2901,15 +3338,6 @@ zfs_create(libzfs_handle_t *hdl, const c "parent '%s' is not a filesystem"), parent); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); - case EDOM: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "volume block size must be power of 2 from " - "%u to %uk"), - (uint_t)SPA_MINBLOCKSIZE, - (uint_t)SPA_MAXBLOCKSIZE >> 10); - - return (zfs_error(hdl, EZFS_BADPROP, errbuf)); - case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to set this " @@ -2935,32 +3363,38 @@ zfs_create(libzfs_handle_t *hdl, const c /* * Destroys the given dataset. The caller must make sure that the filesystem - * isn't mounted, and that there are no active dependents. + * isn't mounted, and that there are no active dependents. If the file system + * does not exist this function does nothing. */ int zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { zfs_cmd_t zc = { 0 }; + if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { + nvlist_t *nv = fnvlist_alloc(); + fnvlist_add_boolean(nv, zhp->zfs_name); + int error = lzc_destroy_bookmarks(nv, NULL); + fnvlist_free(nv); + if (error != 0) { + return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), + zhp->zfs_name)); + } + return (0); + } + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (ZFS_IS_VOLUME(zhp)) { - /* - * If user doesn't have permissions to unshare volume, then - * abort the request. This would only happen for a - * non-privileged user. 
- */ - if (zfs_unshare_iscsi(zhp) != 0) { - return (-1); - } - zc.zc_objset_type = DMU_OST_ZVOL; } else { zc.zc_objset_type = DMU_OST_ZFS; } zc.zc_defer_destroy = defer; - if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) { + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0 && + errno != ENOENT) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zfs_name)); @@ -2972,36 +3406,26 @@ zfs_destroy(zfs_handle_t *zhp, boolean_t } struct destroydata { - char *snapname; - boolean_t gotone; - boolean_t closezhp; + nvlist_t *nvl; + const char *snapname; }; static int zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; - zfs_handle_t *szhp; - char name[ZFS_MAXNAMELEN]; - boolean_t closezhp = dd->closezhp; + char name[ZFS_MAX_DATASET_NAME_LEN]; int rv = 0; - (void) strlcpy(name, zhp->zfs_name, sizeof (name)); - (void) strlcat(name, "@", sizeof (name)); - (void) strlcat(name, dd->snapname, sizeof (name)); - - szhp = make_dataset_handle(zhp->zfs_hdl, name); - if (szhp) { - dd->gotone = B_TRUE; - zfs_close(szhp); - } + (void) snprintf(name, sizeof (name), + "%s@%s", zhp->zfs_name, dd->snapname); - dd->closezhp = B_TRUE; - if (!dd->gotone) - rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); - if (closezhp) - zfs_close(zhp); - return (rv); + if (lzc_exists(name)) + verify(nvlist_add_boolean(dd->nvl, name) == 0); + + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); + zfs_close(zhp); + return (rv); } /* @@ -3010,43 +3434,68 @@ zfs_check_snap_cb(zfs_handle_t *zhp, voi int zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { - zfs_cmd_t zc = { 0 }; int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; - (void) zfs_check_snap_cb(zhp, &dd); + verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); + (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); - if (!dd.gotone) { - return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, + if (nvlist_empty(dd.nvl)) { + ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), - zhp->zfs_name, snapname)); + zhp->zfs_name, snapname); + } else { + ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); } + nvlist_free(dd.nvl); + return (ret); +} - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); - zc.zc_defer_destroy = defer; +/* + * Destroys all the snapshots named in the nvlist. 
+ */ +int +zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) +{ + int ret; + nvlist_t *errlist = NULL; - ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc); - if (ret != 0) { + ret = lzc_destroy_snaps(snaps, defer, &errlist); + + if (ret == 0) { + nvlist_free(errlist); + return (0); + } + + if (nvlist_empty(errlist)) { char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot destroy '%s@%s'"), zc.zc_name, snapname); + ret = zfs_standard_error(hdl, ret, errbuf); + } + for (nvpair_t *pair = nvlist_next_nvpair(errlist, NULL); + pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), + nvpair_name(pair)); - switch (errno) { + switch (fnvpair_value_int32(pair)) { case EEXIST: - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "snapshot is cloned")); - return (zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf)); - + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "snapshot is cloned")); + ret = zfs_error(hdl, EZFS_EXISTS, errbuf); + break; default: - return (zfs_standard_error(zhp->zfs_hdl, errno, - errbuf)); + ret = zfs_standard_error(hdl, errno, errbuf); + break; } } - return (0); + nvlist_free(errlist); + return (ret); } /* @@ -3055,12 +3504,10 @@ zfs_destroy_snaps(zfs_handle_t *zhp, cha int zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) { - zfs_cmd_t zc = { 0 }; - char parent[ZFS_MAXNAMELEN]; + char parent[ZFS_MAX_DATASET_NAME_LEN]; int ret; char errbuf[1024]; libzfs_handle_t *hdl = zhp->zfs_hdl; - zfs_type_t type; uint64_t zoned; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); @@ -3068,7 +3515,7 @@ zfs_clone(zfs_handle_t *zhp, const char (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create '%s'"), target); - /* validate the target name */ + /* validate the target/clone name */ if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); @@ -3079,32 +3526,21 @@ zfs_clone(zfs_handle_t *zhp, const char (void) parent_name(target, parent, sizeof (parent)); /* do the clone */ - if (ZFS_IS_VOLUME(zhp)) { - zc.zc_objset_type = DMU_OST_ZVOL; - type = ZFS_TYPE_VOLUME; - } else { - zc.zc_objset_type = DMU_OST_ZFS; - type = ZFS_TYPE_FILESYSTEM; - } if (props) { + zfs_type_t type; + if (ZFS_IS_VOLUME(zhp)) { + type = ZFS_TYPE_VOLUME; + } else { + type = ZFS_TYPE_FILESYSTEM; + } if ((props = zfs_valid_proplist(hdl, type, props, zoned, - zhp, errbuf)) == NULL) + zhp, zhp->zpool_hdl, errbuf)) == NULL) return (-1); - - if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { - nvlist_free(props); - return (-1); - } - - nvlist_free(props); } - (void) strlcpy(zc.zc_name, target, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, zhp->zfs_name, sizeof (zc.zc_value)); - ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_CREATE, &zc); - - zcmd_free_nvlists(&zc); + ret = lzc_clone(target, zhp->zfs_name, props); + nvlist_free(props); if (ret != 0) { switch (errno) { @@ -3189,74 +3625,149 @@ zfs_promote(zfs_handle_t *zhp) return (ret); } +typedef struct snapdata { + nvlist_t *sd_nvl; + const char *sd_snapname; +} snapdata_t; + +static int +zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) +{ + snapdata_t *sd = arg; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int rv = 0; + + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { + (void) snprintf(name, sizeof (name), 
+ "%s@%s", zfs_get_name(zhp), sd->sd_snapname); + + fnvlist_add_boolean(sd->sd_nvl, name); + + rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); + } + zfs_close(zhp); + + return (rv); +} + /* - * Takes a snapshot of the given dataset. + * Creates snapshots. The keys in the snaps nvlist are the snapshots to be + * created. */ int -zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, - nvlist_t *props) +zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) { - const char *delim; - char parent[ZFS_MAXNAMELEN]; - zfs_handle_t *zhp; - zfs_cmd_t zc = { 0 }; int ret; char errbuf[1024]; + nvpair_t *elem; + nvlist_t *errors; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot snapshot '%s'"), path); + "cannot create snapshots ")); - /* validate the target name */ - if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) - return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - - if (props) { - if ((props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, - props, B_FALSE, NULL, errbuf)) == NULL) - return (-1); + elem = NULL; + while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { + const char *snapname = nvpair_name(elem); - if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { - nvlist_free(props); - return (-1); + /* validate the target name */ + if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, + B_TRUE)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create snapshot '%s'"), snapname); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } + } - nvlist_free(props); + /* + * get pool handle for prop validation. assumes all snaps are in the + * same pool, as does lzc_snapshot (below). + */ + char pool[ZFS_MAX_DATASET_NAME_LEN]; + elem = nvlist_next_nvpair(snaps, NULL); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + zpool_handle_t *zpool_hdl = zpool_open(hdl, pool); + + if (props != NULL && + (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, + props, B_FALSE, NULL, zpool_hdl, errbuf)) == NULL) { + zpool_close(zpool_hdl); + return (-1); } + zpool_close(zpool_hdl); - /* make sure the parent exists and is of the appropriate type */ - delim = strchr(path, '@'); - (void) strncpy(parent, path, delim - path); - parent[delim - path] = '\0'; + ret = lzc_snapshot(snaps, props, &errors); - if ((zhp = zfs_open(hdl, parent, ZFS_TYPE_FILESYSTEM | - ZFS_TYPE_VOLUME)) == NULL) { - zcmd_free_nvlists(&zc); - return (-1); + if (ret != 0) { + boolean_t printed = B_FALSE; + for (elem = nvlist_next_nvpair(errors, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errors, elem)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create snapshot '%s'"), nvpair_name(elem)); + (void) zfs_standard_error(hdl, + fnvpair_value_int32(elem), errbuf); + printed = B_TRUE; + } + if (!printed) { + switch (ret) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "multiple snapshots of same " + "fs not allowed")); + (void) zfs_error(hdl, EZFS_EXISTS, errbuf); + + break; + default: + (void) zfs_standard_error(hdl, ret, errbuf); + } + } } - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, delim+1, sizeof (zc.zc_value)); - if (ZFS_IS_VOLUME(zhp)) - zc.zc_objset_type = DMU_OST_ZVOL; - else - zc.zc_objset_type = DMU_OST_ZFS; - zc.zc_cookie = recursive; - ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SNAPSHOT, &zc); + nvlist_free(props); + nvlist_free(errors); + return (ret); +} - zcmd_free_nvlists(&zc); +int 
+zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, + nvlist_t *props) +{ + int ret; + snapdata_t sd = { 0 }; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *cp; + zfs_handle_t *zhp; + char errbuf[1024]; - /* - * if it was recursive, the one that actually failed will be in - * zc.zc_name. - */ - if (ret != 0) { - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); - (void) zfs_standard_error(hdl, errno, errbuf); + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot snapshot %s"), path); + + if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + (void) strlcpy(fsname, path, sizeof (fsname)); + cp = strchr(fsname, '@'); + *cp = '\0'; + sd.sd_snapname = cp + 1; + + if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) == NULL) { + return (-1); } - zfs_close(zhp); + verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); + if (recursive) { + (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); + } else { + fnvlist_add_boolean(sd.sd_nvl, path); + } + ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); + nvlist_free(sd.sd_nvl); + zfs_close(zhp); return (ret); } @@ -3270,49 +3781,44 @@ typedef struct rollback_data { const char *cb_target; /* the snapshot */ uint64_t cb_create; /* creation time reference */ boolean_t cb_error; - boolean_t cb_dependent; boolean_t cb_force; } rollback_data_t; static int +rollback_destroy_dependent(zfs_handle_t *zhp, void *data) +{ + rollback_data_t *cbp = data; + prop_changelist_t *clp; + + /* We must destroy this clone; first unmount it */ + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, + cbp->cb_force ? MS_FORCE: 0); + if (clp == NULL || changelist_prefix(clp) != 0) { + cbp->cb_error = B_TRUE; + zfs_close(zhp); + return (0); + } + if (zfs_destroy(zhp, B_FALSE) != 0) + cbp->cb_error = B_TRUE; + else + changelist_remove(clp, zhp->zfs_name); + (void) changelist_postfix(clp); + changelist_free(clp); + + zfs_close(zhp); + return (0); +} + +static int rollback_destroy(zfs_handle_t *zhp, void *data) { rollback_data_t *cbp = data; - if (!cbp->cb_dependent) { - if (strcmp(zhp->zfs_name, cbp->cb_target) != 0 && - zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && - zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > - cbp->cb_create) { - char *logstr; - - cbp->cb_dependent = B_TRUE; - cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, - rollback_destroy, cbp); - cbp->cb_dependent = B_FALSE; - - logstr = zhp->zfs_hdl->libzfs_log_str; - zhp->zfs_hdl->libzfs_log_str = NULL; - cbp->cb_error |= zfs_destroy(zhp, B_FALSE); - zhp->zfs_hdl->libzfs_log_str = logstr; - } - } else { - /* We must destroy this clone; first unmount it */ - prop_changelist_t *clp; + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { + cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, + rollback_destroy_dependent, cbp); - clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - cbp->cb_force ? 
MS_FORCE: 0); - if (clp == NULL || changelist_prefix(clp) != 0) { - cbp->cb_error = B_TRUE; - zfs_close(zhp); - return (0); - } - if (zfs_destroy(zhp, B_FALSE) != 0) - cbp->cb_error = B_TRUE; - else - changelist_remove(clp, zhp->zfs_name); - (void) changelist_postfix(clp); - changelist_free(clp); + cbp->cb_error |= zfs_destroy(zhp, B_FALSE); } zfs_close(zhp); @@ -3323,29 +3829,29 @@ rollback_destroy(zfs_handle_t *zhp, void * Given a dataset, rollback to a specific snapshot, discarding any * data changes since then and making it the active dataset. * - * Any snapshots more recent than the target are destroyed, along with - * their dependents. + * Any snapshots and bookmarks more recent than the target are + * destroyed, along with their dependents (i.e. clones). */ int zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) { rollback_data_t cb = { 0 }; int err; - zfs_cmd_t zc = { 0 }; boolean_t restore_resv = 0; - uint64_t old_volsize, new_volsize; + uint64_t old_volsize = 0, new_volsize; zfs_prop_t resv_prop; assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || zhp->zfs_type == ZFS_TYPE_VOLUME); /* - * Destroy all recent snapshots and its dependends. + * Destroy all recent snapshots and their dependents. */ cb.cb_force = force; cb.cb_target = snap->zfs_name; cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); - (void) zfs_iter_children(zhp, rollback_destroy, &cb); + (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb); + (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb); if (cb.cb_error) return (-1); @@ -3363,22 +3869,15 @@ zfs_rollback(zfs_handle_t *zhp, zfs_hand (old_volsize == zfs_prop_get_int(zhp, resv_prop)); } - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (ZFS_IS_VOLUME(zhp)) - zc.zc_objset_type = DMU_OST_ZVOL; - else - zc.zc_objset_type = DMU_OST_ZFS; - /* * We rely on zfs_iter_children() to verify that there are no * newer snapshots for the given dataset. Therefore, we can * simply pass the name on to the ioctl() call. There is still * an unlikely race condition where the user has taken a * snapshot since we verified that this was the most recent. - * */ - if ((err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_ROLLBACK, &zc)) != 0) { + err = lzc_rollback(zhp->zfs_name, NULL, 0); + if (err != 0) { (void) zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), zhp->zfs_name); @@ -3405,54 +3904,20 @@ zfs_rollback(zfs_handle_t *zhp, zfs_hand } /* - * Iterate over all dependents for a given dataset. This includes both - * hierarchical dependents (children) and data dependents (snapshots and - * clones). The bulk of the processing occurs in get_dependents() in - * libzfs_graph.c. - */ -int -zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, - zfs_iter_f func, void *data) -{ - char **dependents; - size_t count; - int i; - zfs_handle_t *child; - int ret = 0; - - if (get_dependents(zhp->zfs_hdl, allowrecursion, zhp->zfs_name, - &dependents, &count) != 0) - return (-1); - - for (i = 0; i < count; i++) { - if ((child = make_dataset_handle(zhp->zfs_hdl, - dependents[i])) == NULL) - continue; - - if ((ret = func(child, data)) != 0) - break; - } - - for (i = 0; i < count; i++) - free(dependents[i]); - free(dependents); - - return (ret); -} - -/* * Renames the given dataset. 
*/ int -zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) +zfs_rename(zfs_handle_t *zhp, const char *source, const char *target, + renameflags_t flags) { - int ret; + int ret = 0; zfs_cmd_t zc = { 0 }; char *delim; prop_changelist_t *cl = NULL; zfs_handle_t *zhrp = NULL; char *parentname = NULL; - char parent[ZFS_MAXNAMELEN]; + char parent[ZFS_MAX_DATASET_NAME_LEN]; + char property[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; char errbuf[1024]; @@ -3463,6 +3928,18 @@ zfs_rename(zfs_handle_t *zhp, const char (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename to '%s'"), target); + if (source != NULL) { + /* + * This is recursive snapshots rename, put snapshot name + * (that might not exist) into zfs_name. + */ + assert(flags.recurse); + + (void) strlcat(zhp->zfs_name, "@", sizeof(zhp->zfs_name)); + (void) strlcat(zhp->zfs_name, source, sizeof(zhp->zfs_name)); + zhp->zfs_type = ZFS_TYPE_SNAPSHOT; + } + /* * Make sure the target name is valid */ @@ -3499,7 +3976,7 @@ zfs_rename(zfs_handle_t *zhp, const char if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } else { - if (recursive) { + if (flags.recurse) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "recursive rename must be a snapshot")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); @@ -3540,7 +4017,19 @@ zfs_rename(zfs_handle_t *zhp, const char return (zfs_error(hdl, EZFS_ZONED, errbuf)); } - if (recursive) { + /* + * Avoid unmounting file systems with mountpoint property set to + * 'legacy' or 'none' even if -u option is not given. + */ + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && + !flags.recurse && !flags.nounmount && + zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, property, + sizeof (property), NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + flags.nounmount = B_TRUE; + } + if (flags.recurse) { parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { @@ -3554,16 +4043,19 @@ zfs_rename(zfs_handle_t *zhp, const char ret = -1; goto error; } - - } else { - if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL) + } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { + if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, + flags.nounmount ? CL_GATHER_DONT_UNMOUNT : 0, + flags.forceunmount ? MS_FORCE : 0)) == NULL) { return (-1); + } if (changelist_haszonedchild(cl)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "child dataset with inherited mountpoint is used " "in a non-global zone")); (void) zfs_error(hdl, EZFS_ZONED, errbuf); + ret = -1; goto error; } @@ -3579,7 +4071,9 @@ zfs_rename(zfs_handle_t *zhp, const char (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - zc.zc_cookie = recursive; + zc.zc_cookie = flags.recurse ? 1 : 0; + if (flags.nounmount) + zc.zc_cookie |= 2; if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { /* @@ -3589,7 +4083,7 @@ zfs_rename(zfs_handle_t *zhp, const char (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zc.zc_name); - if (recursive && errno == EEXIST) { + if (flags.recurse && errno == EEXIST) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "a child dataset already has a snapshot " "with the new name")); @@ -3602,23 +4096,23 @@ zfs_rename(zfs_handle_t *zhp, const char * On failure, we still want to remount any filesystems that * were previously mounted, so we don't alter the system state. 
*/ - if (!recursive) + if (cl != NULL) (void) changelist_postfix(cl); } else { - if (!recursive) { + if (cl != NULL) { changelist_rename(cl, zfs_get_name(zhp), target); ret = changelist_postfix(cl); } } error: - if (parentname) { + if (parentname != NULL) { free(parentname); } - if (zhrp) { + if (zhrp != NULL) { zfs_close(zhrp); } - if (cl) { + if (cl != NULL) { changelist_free(cl); } return (ret); @@ -3653,7 +4147,8 @@ zfs_get_recvd_props(zfs_handle_t *zhp) * of the RECEIVED column. */ int -zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) +zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, + boolean_t literal) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; @@ -3715,18 +4210,18 @@ zfs_expand_proplist(zfs_handle_t *zhp, z * Now go through and check the width of any non-fixed columns */ for (entry = *plp; entry != NULL; entry = entry->pl_next) { - if (entry->pl_fixed) + if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, entry->pl_prop, - buf, sizeof (buf), NULL, NULL, 0, B_FALSE) == 0) { + buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } if (received && zfs_prop_get_recvd(zhp, zfs_prop_to_name(entry->pl_prop), - buf, sizeof (buf), B_FALSE) == 0) + buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } else { @@ -3739,7 +4234,7 @@ zfs_expand_proplist(zfs_handle_t *zhp, z } if (received && zfs_prop_get_recvd(zhp, entry->pl_user_prop, - buf, sizeof (buf), B_FALSE) == 0) + buf, sizeof (buf), literal) == 0) if (strlen(buf) > entry->pl_recvd_width) entry->pl_recvd_width = strlen(buf); } @@ -3749,52 +4244,6 @@ zfs_expand_proplist(zfs_handle_t *zhp, z } int -zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred) -{ - zfs_cmd_t zc = { 0 }; - nvlist_t *nvp; - gid_t gid; - uid_t uid; - const gid_t *groups; - int group_cnt; - int error; - - if (nvlist_alloc(&nvp, NV_UNIQUE_NAME, 0) != 0) - return (no_memory(hdl)); - - uid = ucred_geteuid(cred); - gid = ucred_getegid(cred); - group_cnt = ucred_getgroups(cred, &groups); - - if (uid == (uid_t)-1 || gid == (uid_t)-1 || group_cnt == (uid_t)-1) - return (1); - - if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_UID, uid) != 0) { - nvlist_free(nvp); - return (1); - } - - if (nvlist_add_uint32(nvp, ZFS_DELEG_PERM_GID, gid) != 0) { - nvlist_free(nvp); - return (1); - } - - if (nvlist_add_uint32_array(nvp, - ZFS_DELEG_PERM_GROUPS, (uint32_t *)groups, group_cnt) != 0) { - nvlist_free(nvp); - return (1); - } - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - - if (zcmd_write_src_nvlist(hdl, &zc, nvp)) - return (-1); - - error = ioctl(hdl->libzfs_fd, ZFS_IOC_ISCSI_PERM_CHECK, &zc); - nvlist_free(nvp); - return (error); -} - -int zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, char *resource, void *export, void *sharetab, int sharemax, zfs_share_op_t operation) @@ -3845,6 +4294,7 @@ zfs_prune_proplist(zfs_handle_t *zhp, ui } } +#ifdef illumos static int zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, zfs_smb_acl_op_t cmd, char *resource1, char *resource2) @@ -3860,7 +4310,7 @@ zfs_smb_acl_mgmt(libzfs_handle_t *hdl, c if (cmd == ZFS_SMB_ACL_RENAME) { if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); - return 0; + return (0); } } @@ -3891,8 +4341,7 @@ zfs_smb_acl_mgmt(libzfs_handle_t *hdl, c return (-1); } error = ioctl(hdl->libzfs_fd, 
ZFS_IOC_SMB_ACL, &zc); - if (nvlist) - nvlist_free(nvlist); + nvlist_free(nvlist); return (error); } @@ -3926,68 +4375,151 @@ zfs_smb_acl_rename(libzfs_handle_t *hdl, return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, oldname, newname)); } +#endif /* illumos */ int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, zfs_userspace_cb_t func, void *arg) { zfs_cmd_t zc = { 0 }; - int error; zfs_useracct_t buf[100]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int ret; - (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); zc.zc_objset_type = type; zc.zc_nvlist_dst = (uintptr_t)buf; - /* CONSTCOND */ - while (1) { + for (;;) { zfs_useracct_t *zua = buf; zc.zc_nvlist_dst_size = sizeof (buf); - error = ioctl(zhp->zfs_hdl->libzfs_fd, - ZFS_IOC_USERSPACE_MANY, &zc); - if (error || zc.zc_nvlist_dst_size == 0) + if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot get used/quota for %s"), zc.zc_name); + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + if (zc.zc_nvlist_dst_size == 0) break; while (zc.zc_nvlist_dst_size > 0) { - error = func(arg, zua->zu_domain, zua->zu_rid, - zua->zu_space); - if (error != 0) - return (error); + if ((ret = func(arg, zua->zu_domain, zua->zu_rid, + zua->zu_space)) != 0) + return (ret); zua++; zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); } } - return (error); + return (0); +} + +struct holdarg { + nvlist_t *nvl; + const char *snapname; + const char *tag; + boolean_t recursive; + int error; +}; + +static int +zfs_hold_one(zfs_handle_t *zhp, void *arg) +{ + struct holdarg *ha = arg; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int rv = 0; + + (void) snprintf(name, sizeof (name), + "%s@%s", zhp->zfs_name, ha->snapname); + + if (lzc_exists(name)) + fnvlist_add_string(ha->nvl, name, ha->tag); + + if (ha->recursive) + rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); + zfs_close(zhp); + return (rv); } int zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, - boolean_t recursive, boolean_t temphold, boolean_t enoent_ok) + boolean_t recursive, int cleanup_fd) { - zfs_cmd_t zc = { 0 }; + int ret; + struct holdarg ha; + + ha.nvl = fnvlist_alloc(); + ha.snapname = snapname; + ha.tag = tag; + ha.recursive = recursive; + (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); + + if (nvlist_empty(ha.nvl)) { + char errbuf[1024]; + + fnvlist_free(ha.nvl); + ret = ENOENT; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot hold snapshot '%s@%s'"), + zhp->zfs_name, snapname); + (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); + return (ret); + } + + ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); + fnvlist_free(ha.nvl); + + return (ret); +} + +int +zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) +{ + int ret; + nvlist_t *errors; libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + nvpair_t *elem; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); - if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) - >= sizeof (zc.zc_string)) - return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); - zc.zc_cookie = recursive; - zc.zc_temphold = temphold; + errors = NULL; + ret = lzc_hold(holds, cleanup_fd, &errors); - if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) { - char errbuf[ZFS_MAXNAMELEN+32]; + if (ret == 0) { + /* There may be errors even in the success case. 
*/ + fnvlist_free(errors); + return (0); + } - /* - * if it was recursive, the one that actually failed will be in - * zc.zc_name. - */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot hold '%s@%s'"), zc.zc_name, snapname); - switch (errno) { + if (nvlist_empty(errors)) { + /* no hold-specific errors */ + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot hold")); + switch (ret) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + default: + (void) zfs_standard_error(hdl, ret, errbuf); + } + } + + for (elem = nvlist_next_nvpair(errors, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errors, elem)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot hold snapshot '%s'"), nvpair_name(elem)); + switch (fnvpair_value_int32(elem)) { case E2BIG: /* * Temporary tags wind up having the ds object id @@ -3995,173 +4527,294 @@ zfs_hold(zfs_handle_t *zhp, const char * * above, it's still possible for the tag to wind * up being slightly too long. */ - return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf)); - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded")); - return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); + break; case EINVAL: - return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; case EEXIST: - return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf)); - case ENOENT: - if (enoent_ok) - return (0); - /* FALLTHROUGH */ + (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); + break; default: - return (zfs_standard_error_fmt(hdl, errno, errbuf)); + (void) zfs_standard_error(hdl, + fnvpair_value_int32(elem), errbuf); } } - return (0); + fnvlist_free(errors); + return (ret); } -struct hold_range_arg { - zfs_handle_t *origin; - const char *fromsnap; - const char *tosnap; - char lastsnapheld[ZFS_MAXNAMELEN]; - const char *tag; - boolean_t temphold; - boolean_t seento; - boolean_t seenfrom; - boolean_t holding; - boolean_t recursive; -}; - static int -zfs_hold_range_one(zfs_handle_t *zhp, void *arg) +zfs_release_one(zfs_handle_t *zhp, void *arg) { - struct hold_range_arg *hra = arg; - const char *thissnap; - int error; + struct holdarg *ha = arg; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int rv = 0; + nvlist_t *existing_holds; - thissnap = strchr(zfs_get_name(zhp), '@') + 1; + (void) snprintf(name, sizeof (name), + "%s@%s", zhp->zfs_name, ha->snapname); - if (hra->fromsnap && !hra->seenfrom && - strcmp(hra->fromsnap, thissnap) == 0) - hra->seenfrom = B_TRUE; + if (lzc_get_holds(name, &existing_holds) != 0) { + ha->error = ENOENT; + } else if (!nvlist_exists(existing_holds, ha->tag)) { + ha->error = ESRCH; + } else { + nvlist_t *torelease = fnvlist_alloc(); + fnvlist_add_boolean(torelease, ha->tag); + fnvlist_add_nvlist(ha->nvl, name, torelease); + fnvlist_free(torelease); + } - /* snap is older or newer than the desired range, ignore it */ - if (hra->seento || !hra->seenfrom) { - zfs_close(zhp); + if (ha->recursive) + rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); + zfs_close(zhp); + return (rv); +} + +int +zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive) +{ + int ret; + struct holdarg ha; + nvlist_t *errors = NULL; + nvpair_t *elem; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + + 
ha.nvl = fnvlist_alloc(); + ha.snapname = snapname; + ha.tag = tag; + ha.recursive = recursive; + ha.error = 0; + (void) zfs_release_one(zfs_handle_dup(zhp), &ha); + + if (nvlist_empty(ha.nvl)) { + fnvlist_free(ha.nvl); + ret = ha.error; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot release hold from snapshot '%s@%s'"), + zhp->zfs_name, snapname); + if (ret == ESRCH) { + (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); + } else { + (void) zfs_standard_error(hdl, ret, errbuf); + } + return (ret); + } + + ret = lzc_release(ha.nvl, &errors); + fnvlist_free(ha.nvl); + + if (ret == 0) { + /* There may be errors even in the success case. */ + fnvlist_free(errors); return (0); } - if (hra->holding) { - /* We could be racing with destroy, so ignore ENOENT. */ - error = zfs_hold(hra->origin, thissnap, hra->tag, - hra->recursive, hra->temphold, B_TRUE); - if (error == 0) { - (void) strlcpy(hra->lastsnapheld, zfs_get_name(zhp), - sizeof (hra->lastsnapheld)); + if (nvlist_empty(errors)) { + /* no hold-specific errors */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot release")); + switch (errno) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + default: + (void) zfs_standard_error_fmt(hdl, errno, errbuf); } - } else { - error = zfs_release(hra->origin, thissnap, hra->tag, - hra->recursive); } - if (!hra->seento && strcmp(hra->tosnap, thissnap) == 0) - hra->seento = B_TRUE; + for (elem = nvlist_next_nvpair(errors, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errors, elem)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot release hold from snapshot '%s'"), + nvpair_name(elem)); + switch (fnvpair_value_int32(elem)) { + case ESRCH: + (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); + break; + case EINVAL: + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + default: + (void) zfs_standard_error_fmt(hdl, + fnvpair_value_int32(elem), errbuf); + } + } - zfs_close(zhp); - return (error); + fnvlist_free(errors); + return (ret); } -/* - * Add a user hold on the set of snapshots starting with fromsnap up to - * and including tosnap. If we're unable to to acquire a particular hold, - * undo any holds up to that point. - */ int -zfs_hold_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - const char *tag, boolean_t recursive, boolean_t temphold) +zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) { - struct hold_range_arg arg = { 0 }; - int error; + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int nvsz = 2048; + void *nvbuf; + int err = 0; + char errbuf[1024]; - arg.origin = zhp; - arg.fromsnap = fromsnap; - arg.tosnap = tosnap; - arg.tag = tag; - arg.temphold = temphold; - arg.holding = B_TRUE; - arg.recursive = recursive; - arg.seenfrom = (fromsnap == NULL); + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); - error = zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg); +tryagain: - /* - * Make sure we either hold the entire range or none. 
- */ - if (error && arg.lastsnapheld[0] != '\0') { - (void) zfs_release_range(zhp, fromsnap, - (const char *)arg.lastsnapheld, tag, recursive); + nvbuf = malloc(nvsz); + if (nvbuf == NULL) { + err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); + goto out; } - return (error); + + zc.zc_nvlist_dst_size = nvsz; + zc.zc_nvlist_dst = (uintptr_t)nvbuf; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (ioctl(hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + switch (errno) { + case ENOMEM: + free(nvbuf); + nvsz = zc.zc_nvlist_dst_size; + goto tryagain; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } else { + /* success */ + int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); + if (rc) { + (void) snprintf(errbuf, sizeof (errbuf), dgettext( + TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + err = zfs_standard_error_fmt(hdl, rc, errbuf); + } + } + + free(nvbuf); +out: + return (err); } int -zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, - boolean_t recursive) +zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; + char *nvbuf; + char errbuf[1024]; + size_t nvsz; + int err; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); - if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) - >= sizeof (zc.zc_string)) - return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); - zc.zc_cookie = recursive; + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); - if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) { - char errbuf[ZFS_MAXNAMELEN+32]; + err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); + assert(err == 0); - /* - * if it was recursive, the one that actually failed will be in - * zc.zc_name. 
- */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot release '%s' from '%s@%s'"), tag, zc.zc_name, - snapname); + nvbuf = malloc(nvsz); + + err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); + assert(err == 0); + + zc.zc_nvlist_src_size = nvsz; + zc.zc_nvlist_src = (uintptr_t)nvbuf; + zc.zc_perm_action = un; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), + zc.zc_name); switch (errno) { - case ESRCH: - return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf)); case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded")); - return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; case EINVAL: - return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; default: - return (zfs_standard_error_fmt(hdl, errno, errbuf)); + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; } } - return (0); + free(nvbuf); + + return (err); } -/* - * Release a user hold from the set of snapshots starting with fromsnap - * up to and including tosnap. - */ int -zfs_release_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - const char *tag, boolean_t recursive) +zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) { - struct hold_range_arg arg = { 0 }; + int err; + char errbuf[1024]; + + err = lzc_get_holds(zhp->zfs_name, nvl); - arg.origin = zhp; - arg.fromsnap = fromsnap; - arg.tosnap = tosnap; - arg.tag = tag; - arg.recursive = recursive; - arg.seenfrom = (fromsnap == NULL); + if (err != 0) { + libzfs_handle_t *hdl = zhp->zfs_hdl; - return (zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg)); + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), + zhp->zfs_name); + switch (err) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } + + return (err); } +/* + * Convert the zvol's volume size to an appropriate reservation. + * Note: If this routine is updated, it is necessary to update the ZFS test + * suite's shell version in reservation.kshlib. + */ uint64_t zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) { @@ -4199,3 +4852,48 @@ zvol_volsize_to_reservation(uint64_t vol volsize += numdb; return (volsize); } + +#ifdef __FreeBSD__ +/* + * Attach/detach the given filesystem to/from the given jail. 
+ */ +int +zfs_jail(zfs_handle_t *zhp, int jailid, int attach) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_cmd_t zc = { 0 }; + char errbuf[1024]; + unsigned long cmd; + int ret; + + if (attach) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name); + } + + switch (zhp->zfs_type) { + case ZFS_TYPE_VOLUME: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volumes can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_SNAPSHOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_jailid = jailid; + + cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL; + if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0) + zfs_standard_error(hdl, errno, errbuf); + + return (ret); +} +#endif Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_diff.c 10 Oct 2016 11:14:25 -0000 @@ -0,0 +1,842 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. 
+ * Copyright 2016 Igor Kozhukhov + */ + +/* + * zfs diff support + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libzfs_impl.h" + +#define ZDIFF_SNAPDIR "/.zfs/snapshot/" +#define ZDIFF_SHARESDIR "/.zfs/shares/" +#define ZDIFF_PREFIX "zfs-diff-%d" + +#define ZDIFF_ADDED '+' +#define ZDIFF_MODIFIED 'M' +#define ZDIFF_REMOVED '-' +#define ZDIFF_RENAMED 'R' + +static boolean_t +do_name_cmp(const char *fpath, const char *tpath) +{ + char *fname, *tname; + fname = strrchr(fpath, '/') + 1; + tname = strrchr(tpath, '/') + 1; + return (strcmp(fname, tname) == 0); +} + +typedef struct differ_info { + zfs_handle_t *zhp; + char *fromsnap; + char *frommnt; + char *tosnap; + char *tomnt; + char *ds; + char *dsmnt; + char *tmpsnap; + char errbuf[1024]; + boolean_t isclone; + boolean_t scripted; + boolean_t classify; + boolean_t timestamped; + uint64_t shares; + int zerr; + int cleanupfd; + int outputfd; + int datafd; +} differ_info_t; + +/* + * Given a {dsname, object id}, get the object path + */ +static int +get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, + char *pn, int maxlen, zfs_stat_t *sb) +{ + zfs_cmd_t zc = { 0 }; + int error; + + (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); + zc.zc_obj = obj; + + errno = 0; + error = ioctl(di->zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJ_TO_STATS, &zc); + di->zerr = errno; + + /* we can get stats even if we failed to get a path */ + (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t)); + if (error == 0) { + ASSERT(di->zerr == 0); + (void) strlcpy(pn, zc.zc_value, maxlen); + return (0); + } + + if (di->zerr == EPERM) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "The sys_config privilege or diff delegated permission " + "is needed\nto discover path names")); + return (-1); + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Unable to determine path or stats for " + "object %lld in %s"), obj, dsname); + return (-1); + } +} + +/* + * stream_bytes + * + * Prints a file name out a character at a time. If the character is + * not in the range of what we consider "printable" ASCII, display it + * as an escaped 3-digit octal value. ASCII values less than a space + * are all control characters and we declare the upper end as the + * DELete character. This also is the last 7-bit ASCII character. + * We choose to treat all 8-bit ASCII as not printable for this + * application. 
+ */ +static void +stream_bytes(FILE *fp, const char *string) +{ + char c; + + while ((c = *string++) != '\0') { + if (c > ' ' && c != '\\' && c < '\177') { + (void) fprintf(fp, "%c", c); + } else { + (void) fprintf(fp, "\\%03o", (uint8_t)c); + } + } +} + +static void +print_what(FILE *fp, mode_t what) +{ + char symbol; + + switch (what & S_IFMT) { + case S_IFBLK: + symbol = 'B'; + break; + case S_IFCHR: + symbol = 'C'; + break; + case S_IFDIR: + symbol = '/'; + break; +#ifdef S_IFDOOR + case S_IFDOOR: + symbol = '>'; + break; +#endif + case S_IFIFO: + symbol = '|'; + break; + case S_IFLNK: + symbol = '@'; + break; +#ifdef S_IFPORT + case S_IFPORT: + symbol = 'P'; + break; +#endif + case S_IFSOCK: + symbol = '='; + break; + case S_IFREG: + symbol = 'F'; + break; + default: + symbol = '?'; + break; + } + (void) fprintf(fp, "%c", symbol); +} + +static void +print_cmn(FILE *fp, differ_info_t *di, const char *file) +{ + stream_bytes(fp, di->dsmnt); + stream_bytes(fp, file); +} + +static void +print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new, + zfs_stat_t *isb) +{ + if (di->timestamped) + (void) fprintf(fp, "%10lld.%09lld\t", + (longlong_t)isb->zs_ctime[0], + (longlong_t)isb->zs_ctime[1]); + (void) fprintf(fp, "%c\t", ZDIFF_RENAMED); + if (di->classify) { + print_what(fp, isb->zs_mode); + (void) fprintf(fp, "\t"); + } + print_cmn(fp, di, old); + if (di->scripted) + (void) fprintf(fp, "\t"); + else + (void) fprintf(fp, " -> "); + print_cmn(fp, di, new); + (void) fprintf(fp, "\n"); +} + +static void +print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file, + zfs_stat_t *isb) +{ + if (di->timestamped) + (void) fprintf(fp, "%10lld.%09lld\t", + (longlong_t)isb->zs_ctime[0], + (longlong_t)isb->zs_ctime[1]); + (void) fprintf(fp, "%c\t", ZDIFF_MODIFIED); + if (di->classify) { + print_what(fp, isb->zs_mode); + (void) fprintf(fp, "\t"); + } + print_cmn(fp, di, file); + (void) fprintf(fp, "\t(%+d)", delta); + (void) fprintf(fp, "\n"); +} + +static void +print_file(FILE *fp, differ_info_t *di, char type, const char *file, + zfs_stat_t *isb) +{ + if (di->timestamped) + (void) fprintf(fp, "%10lld.%09lld\t", + (longlong_t)isb->zs_ctime[0], + (longlong_t)isb->zs_ctime[1]); + (void) fprintf(fp, "%c\t", type); + if (di->classify) { + print_what(fp, isb->zs_mode); + (void) fprintf(fp, "\t"); + } + print_cmn(fp, di, file); + (void) fprintf(fp, "\n"); +} + +static int +write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) +{ + struct zfs_stat fsb, tsb; + boolean_t same_name; + mode_t fmode, tmode; + char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN]; + int fobjerr, tobjerr; + int change; + + if (dobj == di->shares) + return (0); + + /* + * Check the from and to snapshots for info on the object. If + * we get ENOENT, then the object just didn't exist in that + * snapshot. If we get ENOTSUP, then we tried to get + * info on a non-ZPL object, which we don't care about anyway. 
+ */ + fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname, + MAXPATHLEN, &fsb); + if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) + return (-1); + + tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname, + MAXPATHLEN, &tsb); + if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) + return (-1); + + /* + * Unallocated object sharing the same meta dnode block + */ + if (fobjerr && tobjerr) { + ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP); + di->zerr = 0; + return (0); + } + + di->zerr = 0; /* negate get_stats_for_obj() from side that failed */ + fmode = fsb.zs_mode & S_IFMT; + tmode = tsb.zs_mode & S_IFMT; + if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 || + tsb.zs_links == 0) + change = 0; + else + change = tsb.zs_links - fsb.zs_links; + + if (fobjerr) { + if (change) { + print_link_change(fp, di, change, tobjname, &tsb); + return (0); + } + print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); + return (0); + } else if (tobjerr) { + if (change) { + print_link_change(fp, di, change, fobjname, &fsb); + return (0); + } + print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); + return (0); + } + + if (fmode != tmode && fsb.zs_gen == tsb.zs_gen) + tsb.zs_gen++; /* Force a generational difference */ + same_name = do_name_cmp(fobjname, tobjname); + + /* Simple modification or no change */ + if (fsb.zs_gen == tsb.zs_gen) { + /* No apparent changes. Could we assert !this? */ + if (fsb.zs_ctime[0] == tsb.zs_ctime[0] && + fsb.zs_ctime[1] == tsb.zs_ctime[1]) + return (0); + if (change) { + print_link_change(fp, di, change, + change > 0 ? fobjname : tobjname, &tsb); + } else if (same_name) { + print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb); + } else { + print_rename(fp, di, fobjname, tobjname, &tsb); + } + return (0); + } else { + /* file re-created or object re-used */ + print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); + print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); + return (0); + } +} + +static int +write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) +{ + uint64_t o; + int err; + + for (o = dr->ddr_first; o <= dr->ddr_last; o++) { + if ((err = write_inuse_diffs_one(fp, di, o)) != 0) + return (err); + } + return (0); +} + +static int +describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf, + int maxlen) +{ + struct zfs_stat sb; + + if (get_stats_for_obj(di, di->fromsnap, object, namebuf, + maxlen, &sb) != 0) { + /* Let it slide, if in the delete queue on from side */ + if (di->zerr == ENOENT && sb.zs_links == 0) { + di->zerr = 0; + return (0); + } + return (-1); + } + + print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb); + return (0); +} + +static int +write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *lhdl = di->zhp->zfs_hdl; + char fobjname[MAXPATHLEN]; + + (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name)); + zc.zc_obj = dr->ddr_first - 1; + + ASSERT(di->zerr == 0); + + while (zc.zc_obj < dr->ddr_last) { + int err; + + err = ioctl(lhdl->libzfs_fd, ZFS_IOC_NEXT_OBJ, &zc); + if (err == 0) { + if (zc.zc_obj == di->shares) { + zc.zc_obj++; + continue; + } + if (zc.zc_obj > dr->ddr_last) { + break; + } + err = describe_free(fp, di, zc.zc_obj, fobjname, + MAXPATHLEN); + if (err) + break; + } else if (errno == ESRCH) { + break; + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "next allocated object (> %lld) find failure"), + zc.zc_obj); + di->zerr = errno; + break; + } + } + if (di->zerr) + return (-1); + 
return (0); +} + +static void * +differ(void *arg) +{ + differ_info_t *di = arg; + dmu_diff_record_t dr; + FILE *ofp; + int err = 0; + + if ((ofp = fdopen(di->outputfd, "w")) == NULL) { + di->zerr = errno; + (void) strerror_r(errno, di->errbuf, sizeof (di->errbuf)); + (void) close(di->datafd); + return ((void *)-1); + } + + for (;;) { + char *cp = (char *)&dr; + int len = sizeof (dr); + int rv; + + do { + rv = read(di->datafd, cp, len); + cp += rv; + len -= rv; + } while (len > 0 && rv > 0); + + if (rv < 0 || (rv == 0 && len != sizeof (dr))) { + di->zerr = EPIPE; + break; + } else if (rv == 0) { + /* end of file at a natural breaking point */ + break; + } + + switch (dr.ddr_type) { + case DDR_FREE: + err = write_free_diffs(ofp, di, &dr); + break; + case DDR_INUSE: + err = write_inuse_diffs(ofp, di, &dr); + break; + default: + di->zerr = EPIPE; + break; + } + + if (err || di->zerr) + break; + } + + (void) fclose(ofp); + (void) close(di->datafd); + if (err) + return ((void *)-1); + if (di->zerr) { + ASSERT(di->zerr == EINVAL); + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Internal error: bad data from diff IOCTL")); + return ((void *)-1); + } + return ((void *)0); +} + +static int +find_shares_object(differ_info_t *di) +{ + char fullpath[MAXPATHLEN]; + struct stat64 sb = { 0 }; + + (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); + (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); + + if (stat64(fullpath, &sb) != 0) { +#ifdef illumos + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); + return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); +#else + return (0); +#endif + } + + di->shares = (uint64_t)sb.st_ino; + return (0); +} + +static int +make_temp_snapshot(differ_info_t *di) +{ + libzfs_handle_t *hdl = di->zhp->zfs_hdl; + zfs_cmd_t zc = { 0 }; + + (void) snprintf(zc.zc_value, sizeof (zc.zc_value), + ZDIFF_PREFIX, getpid()); + (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name)); + zc.zc_cleanup_fd = di->cleanupfd; + + if (ioctl(hdl->libzfs_fd, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { + int err = errno; + if (err == EPERM) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "The diff delegated " + "permission is needed in order\nto create a " + "just-in-time snapshot for diffing\n")); + return (zfs_error(hdl, EZFS_DIFF, di->errbuf)); + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "Cannot create just-in-time " + "snapshot of '%s'"), zc.zc_name); + return (zfs_standard_error(hdl, err, di->errbuf)); + } + } + + di->tmpsnap = zfs_strdup(hdl, zc.zc_value); + di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap); + return (0); +} + +static void +teardown_differ_info(differ_info_t *di) +{ + free(di->ds); + free(di->dsmnt); + free(di->fromsnap); + free(di->frommnt); + free(di->tosnap); + free(di->tmpsnap); + free(di->tomnt); + (void) close(di->cleanupfd); +} + +static int +get_snapshot_names(differ_info_t *di, const char *fromsnap, + const char *tosnap) +{ + libzfs_handle_t *hdl = di->zhp->zfs_hdl; + char *atptrf = NULL; + char *atptrt = NULL; + int fdslen, fsnlen; + int tdslen, tsnlen; + + /* + * Can accept + * dataset@snap1 + * dataset@snap1 dataset@snap2 + * dataset@snap1 @snap2 + * dataset@snap1 dataset + * @snap1 dataset@snap2 + */ + if (tosnap == NULL) { + /* only a from snapshot given, must be valid */ + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Badly formed snapshot name %s"), fromsnap); + + if 
(!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT, + B_FALSE)) { + return (zfs_error(hdl, EZFS_INVALIDNAME, + di->errbuf)); + } + + atptrf = strchr(fromsnap, '@'); + ASSERT(atptrf != NULL); + fdslen = atptrf - fromsnap; + + di->fromsnap = zfs_strdup(hdl, fromsnap); + di->ds = zfs_strdup(hdl, fromsnap); + di->ds[fdslen] = '\0'; + + /* the to snap will be a just-in-time snap of the head */ + return (make_temp_snapshot(di)); + } + + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Unable to determine which snapshots to compare")); + + atptrf = strchr(fromsnap, '@'); + atptrt = strchr(tosnap, '@'); + fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap); + tdslen = atptrt ? atptrt - tosnap : strlen(tosnap); + fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */ + tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */ + + if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0) || + (fsnlen == 0 && tsnlen == 0)) { + return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); + } else if ((fdslen > 0 && tdslen > 0) && + ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) { + /* + * not the same dataset name, might be okay if + * tosnap is a clone of a fromsnap descendant. + */ + char origin[ZFS_MAX_DATASET_NAME_LEN]; + zprop_source_t src; + zfs_handle_t *zhp; + + di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1); + (void) strncpy(di->ds, tosnap, tdslen); + di->ds[tdslen] = '\0'; + + zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM); + while (zhp != NULL) { + if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, + sizeof (origin), &src, NULL, 0, B_FALSE) != 0) { + (void) zfs_close(zhp); + zhp = NULL; + break; + } + if (strncmp(origin, fromsnap, fsnlen) == 0) + break; + + (void) zfs_close(zhp); + zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM); + } + + if (zhp == NULL) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); + } else { + (void) zfs_close(zhp); + } + + di->isclone = B_TRUE; + di->fromsnap = zfs_strdup(hdl, fromsnap); + if (tsnlen) { + di->tosnap = zfs_strdup(hdl, tosnap); + } else { + return (make_temp_snapshot(di)); + } + } else { + int dslen = fdslen ? fdslen : tdslen; + + di->ds = zfs_alloc(hdl, dslen + 1); + (void) strncpy(di->ds, fdslen ? 
fromsnap : tosnap, dslen); + di->ds[dslen] = '\0'; + + di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf); + if (tsnlen) { + di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt); + } else { + return (make_temp_snapshot(di)); + } + } + return (0); +} + +static int +get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt) +{ + boolean_t mounted; + + mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt); + if (mounted == B_FALSE) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Cannot diff an unmounted snapshot")); + return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf)); + } + + /* Avoid a double slash at the beginning of root-mounted datasets */ + if (**mntpt == '/' && *(*mntpt + 1) == '\0') + **mntpt = '\0'; + return (0); +} + +static int +get_mountpoints(differ_info_t *di) +{ + char *strptr; + char *frommntpt; + + /* + * first get the mountpoint for the parent dataset + */ + if (get_mountpoint(di, di->ds, &di->dsmnt) != 0) + return (-1); + + strptr = strchr(di->tosnap, '@'); + ASSERT3P(strptr, !=, NULL); + di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt, + ZDIFF_SNAPDIR, ++strptr); + + strptr = strchr(di->fromsnap, '@'); + ASSERT3P(strptr, !=, NULL); + + frommntpt = di->dsmnt; + if (di->isclone) { + char *mntpt; + int err; + + *strptr = '\0'; + err = get_mountpoint(di, di->fromsnap, &mntpt); + *strptr = '@'; + if (err != 0) + return (-1); + frommntpt = mntpt; + } + + di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt, + ZDIFF_SNAPDIR, ++strptr); + + if (di->isclone) + free(frommntpt); + + return (0); +} + +static int +setup_differ_info(zfs_handle_t *zhp, const char *fromsnap, + const char *tosnap, differ_info_t *di) +{ + di->zhp = zhp; + + di->cleanupfd = open(ZFS_DEV, O_RDWR|O_EXCL); + VERIFY(di->cleanupfd >= 0); + + if (get_snapshot_names(di, fromsnap, tosnap) != 0) + return (-1); + + if (get_mountpoints(di) != 0) + return (-1); + + if (find_shares_object(di) != 0) + return (-1); + + return (0); +} + +int +zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, + const char *tosnap, int flags) +{ + zfs_cmd_t zc = { 0 }; + char errbuf[1024]; + differ_info_t di = { 0 }; + pthread_t tid; + int pipefd[2]; + int iocerr; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "zfs diff failed")); + + if (setup_differ_info(zhp, fromsnap, tosnap, &di)) { + teardown_differ_info(&di); + return (-1); + } + + if (pipe(pipefd)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + teardown_differ_info(&di); + return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); + } + + di.scripted = (flags & ZFS_DIFF_PARSEABLE); + di.classify = (flags & ZFS_DIFF_CLASSIFY); + di.timestamped = (flags & ZFS_DIFF_TIMESTAMP); + + di.outputfd = outfd; + di.datafd = pipefd[0]; + + if (pthread_create(&tid, NULL, differ, &di)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + (void) close(pipefd[0]); + (void) close(pipefd[1]); + teardown_differ_info(&di); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + + /* do the ioctl() */ + (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1); + (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1); + zc.zc_cookie = pipefd[1]; + + iocerr = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DIFF, &zc); + if (iocerr != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Unable to obtain diffs")); + if (errno == EPERM) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "\n The sys_mount privilege or diff delegated " + 
"permission is needed\n to execute the " + "diff ioctl")); + } else if (errno == EXDEV) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "\n Not an earlier snapshot from the same fs")); + } else if (errno != EPIPE || di.zerr == 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + } + (void) close(pipefd[1]); + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + teardown_differ_info(&di); + if (di.zerr != 0 && di.zerr != EPIPE) { + zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); + } else { + return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf)); + } + } + + (void) close(pipefd[1]); + (void) pthread_join(tid, NULL); + + if (di.zerr != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); + } + teardown_differ_info(&di); + return (0); +} Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c 27 Feb 2010 22:30:22 -0000 1.1.1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,653 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * Iterate over all children of the current object. This includes the normal - * dataset hierarchy, but also arbitrary hierarchies due to clones. We want to - * walk all datasets in the pool, and construct a directed graph of the form: - * - * home - * | - * +----+----+ - * | | - * v v ws - * bar baz | - * | | - * v v - * @yesterday ----> foo - * - * In order to construct this graph, we have to walk every dataset in the pool, - * because the clone parent is stored as a property of the child, not the - * parent. The parent only keeps track of the number of clones. - * - * In the normal case (without clones) this would be rather expensive. To avoid - * unnecessary computation, we first try a walk of the subtree hierarchy - * starting from the initial node. At each dataset, we construct a node in the - * graph and an edge leading from its parent. If we don't see any snapshots - * with a non-zero clone count, then we are finished. - * - * If we do find a cloned snapshot, then we finish the walk of the current - * subtree, but indicate that we need to do a complete walk. We then perform a - * global walk of all datasets, avoiding the subtree we already processed. 
- * - * At the end of this, we'll end up with a directed graph of all relevant (and - * possible some irrelevant) datasets in the system. We need to both find our - * limiting subgraph and determine a safe ordering in which to destroy the - * datasets. We do a topological ordering of our graph starting at our target - * dataset, and then walk the results in reverse. - * - * It's possible for the graph to have cycles if, for example, the user renames - * a clone to be the parent of its origin snapshot. The user can request to - * generate an error in this case, or ignore the cycle and continue. - * - * When removing datasets, we want to destroy the snapshots in chronological - * order (because this is the most efficient method). In order to accomplish - * this, we store the creation transaction group with each vertex and keep each - * vertex's edges sorted according to this value. The topological sort will - * automatically walk the snapshots in the correct order. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "libzfs_impl.h" -#include "zfs_namecheck.h" - -#define MIN_EDGECOUNT 4 - -/* - * Vertex structure. Indexed by dataset name, this structure maintains a list - * of edges to other vertices. - */ -struct zfs_edge; -typedef struct zfs_vertex { - char zv_dataset[ZFS_MAXNAMELEN]; - struct zfs_vertex *zv_next; - int zv_visited; - uint64_t zv_txg; - struct zfs_edge **zv_edges; - int zv_edgecount; - int zv_edgealloc; -} zfs_vertex_t; - -enum { - VISIT_SEEN = 1, - VISIT_SORT_PRE, - VISIT_SORT_POST -}; - -/* - * Edge structure. Simply maintains a pointer to the destination vertex. There - * is no need to store the source vertex, since we only use edges in the context - * of the source vertex. - */ -typedef struct zfs_edge { - zfs_vertex_t *ze_dest; - struct zfs_edge *ze_next; -} zfs_edge_t; - -#define ZFS_GRAPH_SIZE 1027 /* this could be dynamic some day */ - -/* - * Graph structure. Vertices are maintained in a hash indexed by dataset name. - */ -typedef struct zfs_graph { - zfs_vertex_t **zg_hash; - size_t zg_size; - size_t zg_nvertex; - const char *zg_root; - int zg_clone_count; -} zfs_graph_t; - -/* - * Allocate a new edge pointing to the target vertex. - */ -static zfs_edge_t * -zfs_edge_create(libzfs_handle_t *hdl, zfs_vertex_t *dest) -{ - zfs_edge_t *zep = zfs_alloc(hdl, sizeof (zfs_edge_t)); - - if (zep == NULL) - return (NULL); - - zep->ze_dest = dest; - - return (zep); -} - -/* - * Destroy an edge. - */ -static void -zfs_edge_destroy(zfs_edge_t *zep) -{ - free(zep); -} - -/* - * Allocate a new vertex with the given name. - */ -static zfs_vertex_t * -zfs_vertex_create(libzfs_handle_t *hdl, const char *dataset) -{ - zfs_vertex_t *zvp = zfs_alloc(hdl, sizeof (zfs_vertex_t)); - - if (zvp == NULL) - return (NULL); - - assert(strlen(dataset) < ZFS_MAXNAMELEN); - - (void) strlcpy(zvp->zv_dataset, dataset, sizeof (zvp->zv_dataset)); - - if ((zvp->zv_edges = zfs_alloc(hdl, - MIN_EDGECOUNT * sizeof (void *))) == NULL) { - free(zvp); - return (NULL); - } - - zvp->zv_edgealloc = MIN_EDGECOUNT; - - return (zvp); -} - -/* - * Destroy a vertex. Frees up any associated edges. - */ -static void -zfs_vertex_destroy(zfs_vertex_t *zvp) -{ - int i; - - for (i = 0; i < zvp->zv_edgecount; i++) - zfs_edge_destroy(zvp->zv_edges[i]); - - free(zvp->zv_edges); - free(zvp); -} - -/* - * Given a vertex, add an edge to the destination vertex. 
- */ -static int -zfs_vertex_add_edge(libzfs_handle_t *hdl, zfs_vertex_t *zvp, - zfs_vertex_t *dest) -{ - zfs_edge_t *zep = zfs_edge_create(hdl, dest); - - if (zep == NULL) - return (-1); - - if (zvp->zv_edgecount == zvp->zv_edgealloc) { - void *ptr; - - if ((ptr = zfs_realloc(hdl, zvp->zv_edges, - zvp->zv_edgealloc * sizeof (void *), - zvp->zv_edgealloc * 2 * sizeof (void *))) == NULL) - return (-1); - - zvp->zv_edges = ptr; - zvp->zv_edgealloc *= 2; - } - - zvp->zv_edges[zvp->zv_edgecount++] = zep; - - return (0); -} - -static int -zfs_edge_compare(const void *a, const void *b) -{ - const zfs_edge_t *ea = *((zfs_edge_t **)a); - const zfs_edge_t *eb = *((zfs_edge_t **)b); - - if (ea->ze_dest->zv_txg < eb->ze_dest->zv_txg) - return (-1); - if (ea->ze_dest->zv_txg > eb->ze_dest->zv_txg) - return (1); - return (0); -} - -/* - * Sort the given vertex edges according to the creation txg of each vertex. - */ -static void -zfs_vertex_sort_edges(zfs_vertex_t *zvp) -{ - if (zvp->zv_edgecount == 0) - return; - - qsort(zvp->zv_edges, zvp->zv_edgecount, sizeof (void *), - zfs_edge_compare); -} - -/* - * Construct a new graph object. We allow the size to be specified as a - * parameter so in the future we can size the hash according to the number of - * datasets in the pool. - */ -static zfs_graph_t * -zfs_graph_create(libzfs_handle_t *hdl, const char *dataset, size_t size) -{ - zfs_graph_t *zgp = zfs_alloc(hdl, sizeof (zfs_graph_t)); - - if (zgp == NULL) - return (NULL); - - zgp->zg_size = size; - if ((zgp->zg_hash = zfs_alloc(hdl, - size * sizeof (zfs_vertex_t *))) == NULL) { - free(zgp); - return (NULL); - } - - zgp->zg_root = dataset; - zgp->zg_clone_count = 0; - - return (zgp); -} - -/* - * Destroy a graph object. We have to iterate over all the hash chains, - * destroying each vertex in the process. - */ -static void -zfs_graph_destroy(zfs_graph_t *zgp) -{ - int i; - zfs_vertex_t *current, *next; - - for (i = 0; i < zgp->zg_size; i++) { - current = zgp->zg_hash[i]; - while (current != NULL) { - next = current->zv_next; - zfs_vertex_destroy(current); - current = next; - } - } - - free(zgp->zg_hash); - free(zgp); -} - -/* - * Graph hash function. Classic bernstein k=33 hash function, taken from - * usr/src/cmd/sgs/tools/common/strhash.c - */ -static size_t -zfs_graph_hash(zfs_graph_t *zgp, const char *str) -{ - size_t hash = 5381; - int c; - - while ((c = *str++) != 0) - hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ - - return (hash % zgp->zg_size); -} - -/* - * Given a dataset name, finds the associated vertex, creating it if necessary. - */ -static zfs_vertex_t * -zfs_graph_lookup(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset, - uint64_t txg) -{ - size_t idx = zfs_graph_hash(zgp, dataset); - zfs_vertex_t *zvp; - - for (zvp = zgp->zg_hash[idx]; zvp != NULL; zvp = zvp->zv_next) { - if (strcmp(zvp->zv_dataset, dataset) == 0) { - if (zvp->zv_txg == 0) - zvp->zv_txg = txg; - return (zvp); - } - } - - if ((zvp = zfs_vertex_create(hdl, dataset)) == NULL) - return (NULL); - - zvp->zv_next = zgp->zg_hash[idx]; - zvp->zv_txg = txg; - zgp->zg_hash[idx] = zvp; - zgp->zg_nvertex++; - - return (zvp); -} - -/* - * Given two dataset names, create an edge between them. For the source vertex, - * mark 'zv_visited' to indicate that we have seen this vertex, and not simply - * created it as a destination of another edge. If 'dest' is NULL, then this - * is an individual vertex (i.e. the starting vertex), so don't add an edge. 
- */ -static int -zfs_graph_add(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *source, - const char *dest, uint64_t txg) -{ - zfs_vertex_t *svp, *dvp; - - if ((svp = zfs_graph_lookup(hdl, zgp, source, 0)) == NULL) - return (-1); - svp->zv_visited = VISIT_SEEN; - if (dest != NULL) { - dvp = zfs_graph_lookup(hdl, zgp, dest, txg); - if (dvp == NULL) - return (-1); - if (zfs_vertex_add_edge(hdl, svp, dvp) != 0) - return (-1); - } - - return (0); -} - -/* - * Iterate over all children of the given dataset, adding any vertices - * as necessary. Returns -1 if there was an error, or 0 otherwise. - * This is a simple recursive algorithm - the ZFS namespace typically - * is very flat. We manually invoke the necessary ioctl() calls to - * avoid the overhead and additional semantics of zfs_open(). - */ -static int -iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) -{ - zfs_cmd_t zc = { 0 }; - zfs_vertex_t *zvp; - - /* - * Look up the source vertex, and avoid it if we've seen it before. - */ - zvp = zfs_graph_lookup(hdl, zgp, dataset, 0); - if (zvp == NULL) - return (-1); - if (zvp->zv_visited == VISIT_SEEN) - return (0); - - /* - * Iterate over all children - */ - for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { - /* - * Get statistics for this dataset, to determine the type of the - * dataset and clone statistics. If this fails, the dataset has - * since been removed, and we're pretty much screwed anyway. - */ - zc.zc_objset_stats.dds_origin[0] = '\0'; - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) - continue; - - if (zc.zc_objset_stats.dds_origin[0] != '\0') { - if (zfs_graph_add(hdl, zgp, - zc.zc_objset_stats.dds_origin, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (-1); - /* - * Count origins only if they are contained in the graph - */ - if (isa_child_of(zc.zc_objset_stats.dds_origin, - zgp->zg_root)) - zgp->zg_clone_count--; - } - - /* - * Add an edge between the parent and the child. - */ - if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (-1); - - /* - * Recursively visit child - */ - if (iterate_children(hdl, zgp, zc.zc_name)) - return (-1); - } - - /* - * Now iterate over all snapshots. - */ - bzero(&zc, sizeof (zc)); - - for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, &zc) == 0; - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { - - /* - * Get statistics for this dataset, to determine the type of the - * dataset and clone statistics. If this fails, the dataset has - * since been removed, and we're pretty much screwed anyway. - */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) - continue; - - /* - * Add an edge between the parent and the child. - */ - if (zfs_graph_add(hdl, zgp, dataset, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (-1); - - zgp->zg_clone_count += zc.zc_objset_stats.dds_num_clones; - } - - zvp->zv_visited = VISIT_SEEN; - - return (0); -} - -/* - * Returns false if there are no snapshots with dependent clones in this - * subtree or if all of those clones are also in this subtree. Returns - * true if there is an error or there are external dependents. 
- */ -static boolean_t -external_dependents(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) -{ - zfs_cmd_t zc = { 0 }; - - /* - * Check whether this dataset is a clone or has clones since - * iterate_children() only checks the children. - */ - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - if (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) - return (B_TRUE); - - if (zc.zc_objset_stats.dds_origin[0] != '\0') { - if (zfs_graph_add(hdl, zgp, - zc.zc_objset_stats.dds_origin, zc.zc_name, - zc.zc_objset_stats.dds_creation_txg) != 0) - return (B_TRUE); - if (isa_child_of(zc.zc_objset_stats.dds_origin, dataset)) - zgp->zg_clone_count--; - } - - if ((zc.zc_objset_stats.dds_num_clones) || - iterate_children(hdl, zgp, dataset)) - return (B_TRUE); - - return (zgp->zg_clone_count != 0); -} - -/* - * Construct a complete graph of all necessary vertices. First, iterate over - * only our object's children. If no cloned snapshots are found, or all of - * the cloned snapshots are in this subtree then return a graph of the subtree. - * Otherwise, start at the root of the pool and iterate over all datasets. - */ -static zfs_graph_t * -construct_graph(libzfs_handle_t *hdl, const char *dataset) -{ - zfs_graph_t *zgp = zfs_graph_create(hdl, dataset, ZFS_GRAPH_SIZE); - int ret = 0; - - if (zgp == NULL) - return (zgp); - - if ((strchr(dataset, '/') == NULL) || - (external_dependents(hdl, zgp, dataset))) { - /* - * Determine pool name and try again. - */ - int len = strcspn(dataset, "/@") + 1; - char *pool = zfs_alloc(hdl, len); - - if (pool == NULL) { - zfs_graph_destroy(zgp); - return (NULL); - } - (void) strlcpy(pool, dataset, len); - - if (iterate_children(hdl, zgp, pool) == -1 || - zfs_graph_add(hdl, zgp, pool, NULL, 0) != 0) { - free(pool); - zfs_graph_destroy(zgp); - return (NULL); - } - free(pool); - } - - if (ret == -1 || zfs_graph_add(hdl, zgp, dataset, NULL, 0) != 0) { - zfs_graph_destroy(zgp); - return (NULL); - } - - return (zgp); -} - -/* - * Given a graph, do a recursive topological sort into the given array. This is - * really just a depth first search, so that the deepest nodes appear first. - * hijack the 'zv_visited' marker to avoid visiting the same vertex twice. - */ -static int -topo_sort(libzfs_handle_t *hdl, boolean_t allowrecursion, char **result, - size_t *idx, zfs_vertex_t *zgv) -{ - int i; - - if (zgv->zv_visited == VISIT_SORT_PRE && !allowrecursion) { - /* - * If we've already seen this vertex as part of our depth-first - * search, then we have a cyclic dependency, and we must return - * an error. - */ - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "recursive dependency at '%s'"), - zgv->zv_dataset); - return (zfs_error(hdl, EZFS_RECURSIVE, - dgettext(TEXT_DOMAIN, - "cannot determine dependent datasets"))); - } else if (zgv->zv_visited >= VISIT_SORT_PRE) { - /* - * If we've already processed this as part of the topological - * sort, then don't bother doing so again. 
- */ - return (0); - } - - zgv->zv_visited = VISIT_SORT_PRE; - - /* avoid doing a search if we don't have to */ - zfs_vertex_sort_edges(zgv); - for (i = 0; i < zgv->zv_edgecount; i++) { - if (topo_sort(hdl, allowrecursion, result, idx, - zgv->zv_edges[i]->ze_dest) != 0) - return (-1); - } - - /* we may have visited this in the course of the above */ - if (zgv->zv_visited == VISIT_SORT_POST) - return (0); - - if ((result[*idx] = zfs_alloc(hdl, - strlen(zgv->zv_dataset) + 1)) == NULL) - return (-1); - - (void) strcpy(result[*idx], zgv->zv_dataset); - *idx += 1; - zgv->zv_visited = VISIT_SORT_POST; - return (0); -} - -/* - * The only public interface for this file. Do the dirty work of constructing a - * child list for the given object. Construct the graph, do the toplogical - * sort, and then return the array of strings to the caller. - * - * The 'allowrecursion' parameter controls behavior when cycles are found. If - * it is set, the the cycle is ignored and the results returned as if the cycle - * did not exist. If it is not set, then the routine will generate an error if - * a cycle is found. - */ -int -get_dependents(libzfs_handle_t *hdl, boolean_t allowrecursion, - const char *dataset, char ***result, size_t *count) -{ - zfs_graph_t *zgp; - zfs_vertex_t *zvp; - - if ((zgp = construct_graph(hdl, dataset)) == NULL) - return (-1); - - if ((*result = zfs_alloc(hdl, - zgp->zg_nvertex * sizeof (char *))) == NULL) { - zfs_graph_destroy(zgp); - return (-1); - } - - if ((zvp = zfs_graph_lookup(hdl, zgp, dataset, 0)) == NULL) { - free(*result); - zfs_graph_destroy(zgp); - return (-1); - } - - *count = 0; - if (topo_sort(hdl, allowrecursion, *result, count, zvp) != 0) { - free(*result); - zfs_graph_destroy(zgp); - return (-1); - } - - /* - * Get rid of the last entry, which is our starting vertex and not - * strictly a dependent. - */ - assert(*count > 0); - free((*result)[*count - 1]); - (*count)--; - - zfs_graph_destroy(zgp); - - return (0); -} Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 libzfs_impl.h --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h 27 Feb 2010 22:30:17 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h 20 Apr 2017 00:10:55 -0000 @@ -20,35 +20,31 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska . All rights reserved. 
*/ -#ifndef _LIBFS_IMPL_H -#define _LIBFS_IMPL_H +#ifndef _LIBZFS_IMPL_H +#define _LIBZFS_IMPL_H -#include #include -#include -#include #include #include +#include +#include +#include #include #include -#include - -#include +#include +#include #ifdef __cplusplus extern "C" { #endif -#ifdef VERIFY -#undef VERIFY -#endif -#define VERIFY verify - typedef struct libzfs_fru { char *zf_device; char *zf_fru; @@ -68,14 +64,13 @@ struct libzfs_handle { int libzfs_desc_active; char libzfs_action[1024]; char libzfs_desc[1024]; - char *libzfs_log_str; int libzfs_printerr; + int libzfs_storeerr; /* stuff error messages into buffer */ void *libzfs_sharehdl; /* libshare handle */ uint_t libzfs_shareflags; boolean_t libzfs_mnttab_enable; avl_tree_t libzfs_mnttab_cache; int libzfs_pool_iter; - topo_hdl_t *libzfs_topo_hdl; libzfs_fru_t **libzfs_fru_hash; libzfs_fru_t *libzfs_fru_list; char libzfs_chassis_id[256]; @@ -86,7 +81,7 @@ struct libzfs_handle { struct zfs_handle { libzfs_handle_t *zfs_hdl; zpool_handle_t *zpool_hdl; - char zfs_name[ZFS_MAXNAMELEN]; + char zfs_name[ZFS_MAX_DATASET_NAME_LEN]; zfs_type_t zfs_type; /* type including snapshot */ zfs_type_t zfs_head_type; /* type excluding snapshot */ dmu_objset_stats_t zfs_dmustats; @@ -107,7 +102,7 @@ struct zfs_handle { struct zpool_handle { libzfs_handle_t *zpool_hdl; zpool_handle_t *zpool_next; - char zpool_name[ZPOOL_MAXNAMELEN]; + char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; int zpool_state; size_t zpool_config_size; nvlist_t *zpool_config; @@ -116,7 +111,7 @@ struct zpool_handle { diskaddr_t zpool_start_block; }; -typedef enum { +typedef enum { PROTO_NFS = 0, PROTO_SMB = 1, PROTO_END = 2 @@ -128,7 +123,6 @@ typedef enum { */ typedef enum { SHARED_NOT_SHARED = 0x0, - SHARED_ISCSI = 0x1, SHARED_NFS = 0x2, SHARED_SMB = 0x4 } zfs_share_type_t; @@ -138,6 +132,7 @@ int zfs_error_fmt(libzfs_handle_t *, int void zfs_error_aux(libzfs_handle_t *, const char *, ...); void *zfs_alloc(libzfs_handle_t *, size_t); void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); +char *zfs_asprintf(libzfs_handle_t *, const char *, ...); char *zfs_strdup(libzfs_handle_t *, const char *); int no_memory(libzfs_handle_t *); @@ -148,7 +143,8 @@ int zpool_standard_error_fmt(libzfs_hand int get_dependents(libzfs_handle_t *, boolean_t, const char *, char ***, size_t *); - +zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); +zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, nvlist_t *, char **, uint64_t *, const char *); @@ -160,7 +156,11 @@ int zprop_expand_list(libzfs_handle_t *h * on each change node regardless of whether or not it is currently * mounted. */ -#define CL_GATHER_MOUNT_ALWAYS 1 +#define CL_GATHER_MOUNT_ALWAYS 0x01 +/* + * Use this changelist_gather() flag to prevent unmounting of file systems. 
+ */ +#define CL_GATHER_DONT_UNMOUNT 0x02 typedef struct prop_changelist prop_changelist_t; @@ -185,11 +185,16 @@ int create_parents(libzfs_handle_t *, ch boolean_t isa_child_of(const char *dataset, const char *parent); zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); +zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *, + nvlist_t *props); int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); +int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, + boolean_t modifying); + void namespace_clear(libzfs_handle_t *); /* @@ -209,4 +214,4 @@ extern void libzfs_fru_clear(libzfs_hand } #endif -#endif /* _LIBFS_IMPL_H */ +#endif /* _LIBZFS_IMPL_H */ Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c,v retrieving revision 1.3 diff -u -p -r1.3 libzfs_import.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c 27 Feb 2010 23:43:52 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c 5 May 2017 17:34:14 -0000 @@ -18,9 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright 2016 Nexenta Systems, Inc. */ /* @@ -39,15 +42,25 @@ * using our derived config, and record the results. */ +#include #include #include #include #include +#include #include #include #include #include #include +#include +#ifdef __FreeBSD__ +#include +#endif +#ifdef __NetBSD__ +#include +static int native_ioctl(int fd, unsigned long cmd, void *arg); +#endif #include @@ -89,6 +102,7 @@ typedef struct pool_list { static char * get_devid(const char *path) { +#ifdef have_devid int fd; ddi_devid_t devid; char *minor, *ret; @@ -108,6 +122,9 @@ get_devid(const char *path) (void) close(fd); return (ret); +#else + return (NULL); +#endif } @@ -189,8 +206,10 @@ fix_paths(nvlist_t *nv, name_entry_t *na if ((devid = get_devid(best->ne_name)) == NULL) { (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); } else { - if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) + if (nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, devid) != 0) { + devid_str_free(devid); return (-1); + } devid_str_free(devid); } @@ -427,13 +446,13 @@ get_configs(libzfs_handle_t *hdl, pool_l pool_entry_t *pe; vdev_entry_t *ve; config_entry_t *ce; - nvlist_t *ret = NULL, *config = NULL, *tmp, *nvtop, *nvroot; + nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; nvlist_t **spares, **l2cache; uint_t i, nspares, nl2cache; boolean_t config_seen; uint64_t best_txg; - char *name, *hostname; - uint64_t version, guid; + char *name, *hostname = NULL; + uint64_t guid; uint_t children = 0; nvlist_t **child = NULL; uint_t holes; @@ -519,47 +538,48 @@ get_configs(libzfs_handle_t *hdl, pool_l * configuration: * * version - * pool guid - * name - * pool state + * pool guid + * name + * comment (if available) + * pool state * hostid (if available) * hostname (if available) */ - uint64_t state; + uint64_t state, version; + char *comment = NULL; + + version = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VERSION); + fnvlist_add_uint64(config, + 
ZPOOL_CONFIG_VERSION, version); + guid = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_GUID); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_GUID, guid); + name = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_POOL_NAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_POOL_NAME, name); + + if (nvlist_lookup_string(tmp, + ZPOOL_CONFIG_COMMENT, &comment) == 0) + fnvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, comment); + + state = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_STATE); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_STATE, state); - verify(nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_VERSION, &version) == 0); - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_VERSION, version) != 0) - goto nomem; - verify(nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_GUID, &guid) == 0); - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_GUID, guid) != 0) - goto nomem; - verify(nvlist_lookup_string(tmp, - ZPOOL_CONFIG_POOL_NAME, &name) == 0); - if (nvlist_add_string(config, - ZPOOL_CONFIG_POOL_NAME, name) != 0) - goto nomem; - verify(nvlist_lookup_uint64(tmp, - ZPOOL_CONFIG_POOL_STATE, &state) == 0); - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_POOL_STATE, state) != 0) - goto nomem; hostid = 0; if (nvlist_lookup_uint64(tmp, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { - if (nvlist_add_uint64(config, - ZPOOL_CONFIG_HOSTID, hostid) != 0) - goto nomem; - verify(nvlist_lookup_string(tmp, - ZPOOL_CONFIG_HOSTNAME, - &hostname) == 0); - if (nvlist_add_string(config, - ZPOOL_CONFIG_HOSTNAME, - hostname) != 0) - goto nomem; + fnvlist_add_uint64(config, + ZPOOL_CONFIG_HOSTID, hostid); + hostname = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_HOSTNAME, hostname); } config_seen = B_TRUE; @@ -655,8 +675,10 @@ get_configs(libzfs_handle_t *hdl, pool_l nvlist_add_uint64(holey, ZPOOL_CONFIG_ID, c) != 0 || nvlist_add_uint64(holey, - ZPOOL_CONFIG_GUID, 0ULL) != 0) + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(holey); goto nomem; + } child[c] = holey; } } @@ -897,10 +919,203 @@ zpool_read_label(int fd, nvlist_t **conf return (0); } +typedef struct rdsk_node { + char *rn_name; + int rn_dfd; + libzfs_handle_t *rn_hdl; + nvlist_t *rn_config; + avl_tree_t *rn_avl; + avl_node_t rn_node; + boolean_t rn_nozpool; +} rdsk_node_t; + +static int +slice_cache_compare(const void *arg1, const void *arg2) +{ + const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; + const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; + char *nm1slice, *nm2slice; + int rv; + + /* + * slices zero and two are the most likely to provide results, + * so put those first + */ + nm1slice = strstr(nm1, "s0"); + nm2slice = strstr(nm2, "s0"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + nm1slice = strstr(nm1, "s2"); + nm2slice = strstr(nm2, "s2"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + + rv = strcmp(nm1, nm2); + if (rv == 0) + return (0); + return (rv > 0 ? 1 : -1); +} + +#ifdef illumos +static void +check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, + diskaddr_t size, uint_t blksz) +{ + rdsk_node_t tmpnode; + rdsk_node_t *node; + char sname[MAXNAMELEN]; + + tmpnode.rn_name = &sname[0]; + (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", + diskname, partno); + /* + * protect against division by zero for disk labels that + * contain a bogus sector size + */ + if (blksz == 0) + blksz = DEV_BSIZE; + /* too small to contain a zpool? 
*/ + if ((size < (SPA_MINDEVSIZE / blksz)) && + (node = avl_find(r, &tmpnode, NULL))) + node->rn_nozpool = B_TRUE; +} +#endif /* illumos */ + +static void +nozpool_all_slices(avl_tree_t *r, const char *sname) +{ +#ifdef illumos + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if (((ptr = strrchr(diskname, 's')) == NULL) && + ((ptr = strrchr(diskname, 'p')) == NULL)) + return; + ptr[0] = 's'; + ptr[1] = '\0'; + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, 0, 1); + ptr[0] = 'p'; + for (i = 0; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); +#endif /* illumos */ +} + +#ifdef illumos +static void +check_slices(avl_tree_t *r, int fd, const char *sname) +{ + struct extvtoc vtoc; + struct dk_gpt *gpt; + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) + return; + ptr[1] = '\0'; + + if (read_extvtoc(fd, &vtoc) >= 0) { + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + vtoc.v_part[i].p_size, vtoc.v_sectorsz); + } else if (efi_alloc_and_read(fd, &gpt) >= 0) { + /* + * on x86 we'll still have leftover links that point + * to slices s[9-15], so use NDKMAP instead + */ + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + gpt->efi_parts[i].p_size, gpt->efi_lbasize); + /* nodes p[1-4] are never used with EFI labels */ + ptr[0] = 'p'; + for (i = 1; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); + efi_free(gpt); + } +} +#endif /* illumos */ + +static void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + int fd; + + if (rn->rn_nozpool) + return; + if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { + /* symlink to a device that's no longer there */ + if (errno == ENOENT) + nozpool_all_slices(rn->rn_avl, rn->rn_name); + return; + } + /* + * Ignore failed stats. We only want regular + * files, character devs and block devs. + */ + if (fstat64(fd, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && + !S_ISCHR(statbuf.st_mode) && + !S_ISBLK(statbuf.st_mode))) { + (void) close(fd); + return; + } + /* this file is too small to hold a zpool */ +#ifdef illumos + if (S_ISREG(statbuf.st_mode) && + statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } else if (!S_ISREG(statbuf.st_mode)) { + /* + * Try to read the disk label first so we don't have to + * open a bunch of minor nodes that can't have a zpool. + */ + check_slices(rn->rn_avl, fd, rn->rn_name); + } +#endif /* illumos */ +#ifdef __FreeBSD__ + if (statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ + off_t size; + + if (native_ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || + size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } +#endif + + if ((zpool_read_label(fd, &config)) != 0) { + (void) close(fd); + (void) no_memory(rn->rn_hdl); + return; + } + (void) close(fd); + + rn->rn_config = config; +} + /* - * Given a file descriptor, clear (zero) the label information. This function - * is currently only used in the appliance stack as part of the ZFS sysevent - * module. + * Given a file descriptor, clear (zero) the label information. 
*/ int zpool_clear_label(int fd) @@ -919,8 +1134,10 @@ zpool_clear_label(int fd) for (l = 0; l < VDEV_LABELS; l++) { if (pwrite64(fd, label, sizeof (vdev_label_t), - label_offset(size, l)) != sizeof (vdev_label_t)) + label_offset(size, l)) != sizeof (vdev_label_t)) { + free(label); return (-1); + } } free(label); @@ -938,20 +1155,20 @@ static nvlist_t * zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) { int i, dirs = iarg->paths; - DIR *dirp = NULL; struct dirent64 *dp; char path[MAXPATHLEN]; char *end, **dir = iarg->path; size_t pathleft; - struct stat64 statbuf; - nvlist_t *ret = NULL, *config; - static char *default_dir = "/dev/dsk"; - int fd; + nvlist_t *ret = NULL; + static char *default_dir = "/dev"; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; + avl_tree_t slice_cache; + rdsk_node_t *slice; + void *cookie; if (dirs == 0) { dirs = 1; @@ -964,8 +1181,11 @@ zpool_find_import_impl(libzfs_handle_t * * and toplevel GUID. */ for (i = 0; i < dirs; i++) { - char *rdsk; + tpool_t *t; + char rdsk[MAXPATHLEN]; int dfd; + boolean_t config_failed = B_FALSE; + DIR *dirp; /* use realpath to normalize the path */ if (realpath(dir[i], path) == 0) { @@ -978,18 +1198,22 @@ zpool_find_import_impl(libzfs_handle_t * *end = 0; pathleft = &path[sizeof (path)] - end; +#ifdef illumos /* * Using raw devices instead of block devices when we're * reading the labels skips a bunch of slow operations during * close(2) processing, so we replace /dev/dsk with /dev/rdsk. */ - if (strcmp(path, "/dev/dsk/") == 0) - rdsk = "/dev/rdsk/"; + if (strcmp(path, ZFS_DISK_ROOTD) == 0) + (void) strlcpy(rdsk, ZFS_RDISK_ROOTD, sizeof (rdsk)); else - rdsk = path; +#endif + (void) strlcpy(rdsk, path, sizeof (rdsk)); if ((dfd = open64(rdsk, O_RDONLY)) < 0 || (dirp = fdopendir(dfd)) == NULL) { + if (dfd >= 0) + (void) close(dfd); zfs_error_aux(hdl, strerror(errno)); (void) zfs_error_fmt(hdl, EZFS_BADPATH, dgettext(TEXT_DOMAIN, "cannot open '%s'"), @@ -997,6 +1221,79 @@ zpool_find_import_impl(libzfs_handle_t * goto error; } + avl_create(&slice_cache, slice_cache_compare, + sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); + +#ifdef __FreeBSD__ + if (strcmp(rdsk, "/dev/") == 0) { + struct gmesh mesh; + struct gclass *mp; + struct ggeom *gp; + struct gprovider *pp; + + errno = geom_gettree(&mesh); + if (errno != 0) { + zfs_error_aux(hdl, strerror(errno)); + (void) zfs_error_fmt(hdl, EZFS_BADPATH, + dgettext(TEXT_DOMAIN, "cannot get GEOM tree")); + goto error; + } + + LIST_FOREACH(mp, &mesh.lg_class, lg_class) { + LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { + LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { + slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zfs_strdup(hdl, pp->lg_name); + slice->rn_avl = &slice_cache; + slice->rn_dfd = dfd; + slice->rn_hdl = hdl; + slice->rn_nozpool = B_FALSE; + avl_add(&slice_cache, slice); + } + } + } + + geom_deletetree(&mesh); + goto skipdir; + } +#endif +#ifdef __NetBSD__ + if (strcmp(rdsk, "/dev/") == 0) { + static const char mib_name[] = "hw.disknames"; + size_t len; + char *disknames, *last, *name; + char part; + + part = getrawpartition(); + if (sysctlbyname(mib_name, NULL, &len, NULL, 0) == -1) { + zfs_error_aux(hdl, strerror(errno)); + (void) zfs_error_fmt(hdl, EZFS_BADPATH, + dgettext(TEXT_DOMAIN, "cannot get hw.disknames list")); + + avl_destroy(&slice_cache); + (void) closedir(dirp); + goto error; + } + disknames = zfs_alloc(hdl, len + 2); + 
(void)sysctlbyname(mib_name, disknames, &len, NULL, 0); + + + for ((name = strtok_r(disknames, " ", &last)); name; + (name = strtok_r(NULL, " ", &last))) { + slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zfs_asprintf(hdl, "%s%c", name, 'a' + part); + slice->rn_avl = &slice_cache; + slice->rn_dfd = dfd; + slice->rn_hdl = hdl; + slice->rn_nozpool = B_FALSE; + avl_add(&slice_cache, slice); + } + free(disknames); + + goto skipdir; + } +#endif + /* * This is not MT-safe, but we have no MT consumers of libzfs */ @@ -1006,33 +1303,38 @@ zpool_find_import_impl(libzfs_handle_t * (name[1] == 0 || (name[1] == '.' && name[2] == 0))) continue; - (void)snprintf(path, sizeof (path), "%s/%s", - rdsk, dp->d_name); - - if ((fd = open(path, O_RDONLY)) < 0) - continue; - - /* - * Ignore failed stats. We only want regular - * files, character devs and block devs. - */ - if (fstat64(fd, &statbuf) != 0 || - (!S_ISREG(statbuf.st_mode) && - !S_ISCHR(statbuf.st_mode) && - !S_ISBLK(statbuf.st_mode))) { - (void) close(fd); - continue; - } - - if ((zpool_read_label(fd, &config)) != 0) { - (void) close(fd); - (void) no_memory(hdl); - goto error; - } - - (void) close(fd); - - if (config != NULL) { + slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zfs_strdup(hdl, name); + slice->rn_avl = &slice_cache; + slice->rn_dfd = dfd; + slice->rn_hdl = hdl; + slice->rn_nozpool = B_FALSE; + avl_add(&slice_cache, slice); + } +skipdir: + /* + * create a thread pool to do all of this in parallel; + * rn_nozpool is not protected, so this is racy in that + * multiple tasks could decide that the same slice can + * not hold a zpool, which is benign. Also choose + * double the number of processors; we hold a lot of + * locks in the kernel, so going beyond this doesn't + * buy us much. 
+ */ + t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + for (slice = avl_first(&slice_cache); slice; + (slice = avl_walk(&slice_cache, slice, + AVL_AFTER))) + (void) tpool_dispatch(t, zpool_open_func, slice); + tpool_wait(t); + tpool_destroy(t); + + cookie = NULL; + while ((slice = avl_destroy_nodes(&slice_cache, + &cookie)) != NULL) { + if (slice->rn_config != NULL && !config_failed) { + nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; if (iarg->poolname != NULL) { @@ -1052,18 +1354,26 @@ zpool_find_import_impl(libzfs_handle_t * } if (!matched) { nvlist_free(config); - config = NULL; - continue; + } else { + /* + * use the non-raw path for the config + */ + (void) strlcpy(end, slice->rn_name, + pathleft); + if (add_config(hdl, &pools, path, + config) != 0) + config_failed = B_TRUE; } - /* use the non-raw path for the config */ - (void) strlcpy(end, name, pathleft); - if (add_config(hdl, &pools, path, config) != 0) - goto error; } + free(slice->rn_name); + free(slice); } + avl_destroy(&slice_cache); (void) closedir(dirp); - dirp = NULL; + + if (config_failed) + goto error; } ret = get_configs(hdl, &pools, iarg->can_be_active); @@ -1075,8 +1385,7 @@ error: venext = ve->ve_next; for (ce = ve->ve_configs; ce != NULL; ce = cenext) { cenext = ce->ce_next; - if (ce->ce_config) - nvlist_free(ce->ce_config); + nvlist_free(ce->ce_config); free(ce); } free(ve); @@ -1086,14 +1395,10 @@ error: for (ne = pools.names; ne != NULL; ne = nenext) { nenext = ne->ne_next; - if (ne->ne_name) - free(ne->ne_name); + free(ne->ne_name); free(ne); } - if (dirp) - (void) closedir(dirp); - return (ret); } @@ -1182,21 +1487,15 @@ zpool_find_import_cached(libzfs_handle_t elem = NULL; while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) { - verify(nvpair_value_nvlist(elem, &src) == 0); + src = fnvpair_value_nvlist(elem); - verify(nvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME, - &name) == 0); + name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); if (poolname != NULL && strcmp(poolname, name) != 0) continue; - verify(nvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID, - &this_guid) == 0); - if (guid != 0) { - verify(nvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID, - &this_guid) == 0); - if (guid != this_guid) - continue; - } + this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); + if (guid != 0 && guid != this_guid) + continue; if (pool_active(hdl, name, this_guid, &active) != 0) { nvlist_free(raw); @@ -1366,6 +1665,24 @@ zpool_in_use(libzfs_handle_t *hdl, int f switch (stateval) { case POOL_STATE_EXPORTED: + /* + * A pool with an exported state may in fact be imported + * read-only, so check the in-core state to see if it's + * active and imported read-only. If it is, set + * its state to active. + */ + if (pool_active(hdl, name, guid, &isactive) == 0 && isactive && + (zhp = zpool_open_canfail(hdl, name)) != NULL) { + if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL)) + stateval = POOL_STATE_ACTIVE; + + /* + * All we needed the zpool handle for is the + * readonly prop check. 
+ */ + zpool_close(zhp); + } + ret = B_TRUE; break; @@ -1439,9 +1756,9 @@ zpool_in_use(libzfs_handle_t *hdl, int f cb.cb_type = ZPOOL_CONFIG_SPARES; if (zpool_iter(hdl, find_aux, &cb) == 1) { name = (char *)zpool_get_name(cb.cb_zhp); - ret = TRUE; + ret = B_TRUE; } else { - ret = FALSE; + ret = B_FALSE; } break; @@ -1455,9 +1772,9 @@ zpool_in_use(libzfs_handle_t *hdl, int f cb.cb_type = ZPOOL_CONFIG_L2CACHE; if (zpool_iter(hdl, find_aux, &cb) == 1) { name = (char *)zpool_get_name(cb.cb_zhp); - ret = TRUE; + ret = B_TRUE; } else { - ret = FALSE; + ret = B_FALSE; } break; @@ -1483,3 +1800,18 @@ zpool_in_use(libzfs_handle_t *hdl, int f *inuse = ret; return (0); } + +#ifdef __NetBSD__ +/* + * This needs to be at the end of the file so that we can #undef ioctl + * without affecting anything else. + */ +#undef ioctl + +static int +native_ioctl(int fd, unsigned long cmd, void *arg) +{ + + return ioctl(fd, cmd, arg); +} +#endif Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c diff -N src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_iter.c 10 Oct 2016 11:14:25 -0000 @@ -0,0 +1,526 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" + +int +zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + nvlist_t *nvl = zfs_get_clones_nvl(zhp); + nvpair_t *pair; + + if (nvl == NULL) + return (0); + + for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) { + zfs_handle_t *clone = zfs_open(zhp->zfs_hdl, nvpair_name(pair), + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (clone != NULL) { + int err = func(clone, data); + if (err != 0) + return (err); + } + } + return (0); +} + +static int +zfs_do_list_ioctl(zfs_handle_t *zhp, unsigned long arg, zfs_cmd_t *zc) +{ + int rc; + uint64_t orig_cookie; + + orig_cookie = zc->zc_cookie; +top: + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + + if (rc == -1) { + switch (errno) { + case ENOMEM: + /* expand nvlist memory and try again */ + if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { + zcmd_free_nvlists(zc); + return (-1); + } + zc->zc_cookie = orig_cookie; + goto top; + /* + * An errno value of ESRCH indicates normal completion. + * If ENOENT is returned, then the underlying dataset + * has been removed since we obtained the handle. + */ + case ESRCH: + case ENOENT: + rc = 1; + break; + default: + rc = zfs_standard_error(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, + "cannot iterate filesystems")); + break; + } + } + return (rc); +} + +/* + * Iterate over all child filesystems + */ +int +zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + zfs_cmd_t zc = { 0 }; + zfs_handle_t *nzhp; + int ret; + + if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) + return (0); + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, + &zc)) == 0) { + /* + * Silently ignore errors, as the only plausible explanation is + * that the pool has since been removed. + */ + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { + continue; + } + + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); + return (ret); + } + } + zcmd_free_nvlists(&zc); + return ((ret < 0) ? ret : 0); +} + +/* + * Iterate over all snapshots + */ +int +zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func, + void *data) +{ + zfs_cmd_t zc = { 0 }; + zfs_handle_t *nzhp; + int ret; + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT || + zhp->zfs_type == ZFS_TYPE_BOOKMARK) + return (0); + + zc.zc_simple = simple; + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, + &zc)) == 0) { + + if (simple) + nzhp = make_dataset_simple_handle_zc(zhp, &zc); + else + nzhp = make_dataset_handle_zc(zhp->zfs_hdl, &zc); + if (nzhp == NULL) + continue; + + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); + return (ret); + } + } + zcmd_free_nvlists(&zc); + return ((ret < 0) ? ret : 0); +} + +/* + * Iterate over all bookmarks + */ +int +zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + zfs_handle_t *nzhp; + nvlist_t *props = NULL; + nvlist_t *bmarks = NULL; + int err; + + if ((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) != 0) + return (0); + + /* Setup the requested properties nvlist. 
*/ + props = fnvlist_alloc(); + fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_GUID)); + fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATETXG)); + fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_CREATION)); + + if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0) + goto out; + + for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + char *bmark_name; + nvlist_t *bmark_props; + + bmark_name = nvpair_name(pair); + bmark_props = fnvpair_value_nvlist(pair); + + (void) snprintf(name, sizeof (name), "%s#%s", zhp->zfs_name, + bmark_name); + + nzhp = make_bookmark_handle(zhp, name, bmark_props); + if (nzhp == NULL) + continue; + + if ((err = func(nzhp, data)) != 0) + goto out; + } + +out: + fnvlist_free(props); + fnvlist_free(bmarks); + + return (err); +} + +/* + * Routines for dealing with the sorted snapshot functionality + */ +typedef struct zfs_node { + zfs_handle_t *zn_handle; + avl_node_t zn_avlnode; +} zfs_node_t; + +static int +zfs_sort_snaps(zfs_handle_t *zhp, void *data) +{ + avl_tree_t *avl = data; + zfs_node_t *node; + zfs_node_t search; + + search.zn_handle = zhp; + node = avl_find(avl, &search, NULL); + if (node) { + /* + * If this snapshot was renamed while we were creating the + * AVL tree, it's possible that we already inserted it under + * its old name. Remove the old handle before adding the new + * one. + */ + zfs_close(node->zn_handle); + avl_remove(avl, node); + free(node); + } + + node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); + node->zn_handle = zhp; + avl_add(avl, node); + + return (0); +} + +static int +zfs_snapshot_compare(const void *larg, const void *rarg) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + uint64_t lcreate, rcreate; + + /* + * Sort them according to creation time. We use the hidden + * CREATETXG property to get an absolute ordering of snapshots. 
+ */ + lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); + rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); + + if (lcreate < rcreate) + return (-1); + else if (lcreate > rcreate) + return (+1); + else + return (0); +} + +int +zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data) +{ + int ret = 0; + zfs_node_t *node; + avl_tree_t avl; + void *cookie = NULL; + + avl_create(&avl, zfs_snapshot_compare, + sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); + + ret = zfs_iter_snapshots(zhp, B_FALSE, zfs_sort_snaps, &avl); + + for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) + ret |= callback(node->zn_handle, data); + + while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL) + free(node); + + avl_destroy(&avl); + + return (ret); +} + +typedef struct { + char *ssa_first; + char *ssa_last; + boolean_t ssa_seenfirst; + boolean_t ssa_seenlast; + zfs_iter_f ssa_func; + void *ssa_arg; +} snapspec_arg_t; + +static int +snapspec_cb(zfs_handle_t *zhp, void *arg) +{ + snapspec_arg_t *ssa = arg; + char *shortsnapname; + int err = 0; + + if (ssa->ssa_seenlast) + return (0); + shortsnapname = zfs_strdup(zhp->zfs_hdl, + strchr(zfs_get_name(zhp), '@') + 1); + + if (!ssa->ssa_seenfirst && strcmp(shortsnapname, ssa->ssa_first) == 0) + ssa->ssa_seenfirst = B_TRUE; + + if (ssa->ssa_seenfirst) { + err = ssa->ssa_func(zhp, ssa->ssa_arg); + } else { + zfs_close(zhp); + } + + if (strcmp(shortsnapname, ssa->ssa_last) == 0) + ssa->ssa_seenlast = B_TRUE; + free(shortsnapname); + + return (err); +} + +/* + * spec is a string like "A,B%C,D" + * + * , where can be: + * (single snapshot) + * % (range of snapshots, inclusive) + * % (range of snapshots, starting with earliest) + * % (range of snapshots, ending with last) + * % (all snapshots) + * [,...] (comma separated list of the above) + * + * If a snapshot can not be opened, continue trying to open the others, but + * return ENOENT at the end. + */ +int +zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, + zfs_iter_f func, void *arg) +{ + char *buf, *comma_separated, *cp; + int err = 0; + int ret = 0; + + buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig); + cp = buf; + + while ((comma_separated = strsep(&cp, ",")) != NULL) { + char *pct = strchr(comma_separated, '%'); + if (pct != NULL) { + snapspec_arg_t ssa = { 0 }; + ssa.ssa_func = func; + ssa.ssa_arg = arg; + + if (pct == comma_separated) + ssa.ssa_seenfirst = B_TRUE; + else + ssa.ssa_first = comma_separated; + *pct = '\0'; + ssa.ssa_last = pct + 1; + + /* + * If there is a lastname specified, make sure it + * exists. 
+ */ + if (ssa.ssa_last[0] != '\0') { + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(snapname, sizeof (snapname), + "%s@%s", zfs_get_name(fs_zhp), + ssa.ssa_last); + if (!zfs_dataset_exists(fs_zhp->zfs_hdl, + snapname, ZFS_TYPE_SNAPSHOT)) { + ret = ENOENT; + continue; + } + } + + err = zfs_iter_snapshots_sorted(fs_zhp, + snapspec_cb, &ssa); + if (ret == 0) + ret = err; + if (ret == 0 && (!ssa.ssa_seenfirst || + (ssa.ssa_last[0] != '\0' && !ssa.ssa_seenlast))) { + ret = ENOENT; + } + } else { + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + zfs_handle_t *snap_zhp; + (void) snprintf(snapname, sizeof (snapname), "%s@%s", + zfs_get_name(fs_zhp), comma_separated); + snap_zhp = make_dataset_handle(fs_zhp->zfs_hdl, + snapname); + if (snap_zhp == NULL) { + ret = ENOENT; + continue; + } + err = func(snap_zhp, arg); + if (ret == 0) + ret = err; + } + } + + free(buf); + return (ret); +} + +/* + * Iterate over all children, snapshots and filesystems + */ +int +zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + int ret; + + if ((ret = zfs_iter_filesystems(zhp, func, data)) != 0) + return (ret); + + return (zfs_iter_snapshots(zhp, B_FALSE, func, data)); +} + + +typedef struct iter_stack_frame { + struct iter_stack_frame *next; + zfs_handle_t *zhp; +} iter_stack_frame_t; + +typedef struct iter_dependents_arg { + boolean_t first; + boolean_t allowrecursion; + iter_stack_frame_t *stack; + zfs_iter_f func; + void *data; +} iter_dependents_arg_t; + +static int +iter_dependents_cb(zfs_handle_t *zhp, void *arg) +{ + iter_dependents_arg_t *ida = arg; + int err = 0; + boolean_t first = ida->first; + ida->first = B_FALSE; + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + err = zfs_iter_clones(zhp, iter_dependents_cb, ida); + } else if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) { + iter_stack_frame_t isf; + iter_stack_frame_t *f; + + /* + * check if there is a cycle by seeing if this fs is already + * on the stack. 
+ */ + for (f = ida->stack; f != NULL; f = f->next) { + if (f->zhp->zfs_dmustats.dds_guid == + zhp->zfs_dmustats.dds_guid) { + if (ida->allowrecursion) { + zfs_close(zhp); + return (0); + } else { + zfs_error_aux(zhp->zfs_hdl, + dgettext(TEXT_DOMAIN, + "recursive dependency at '%s'"), + zfs_get_name(zhp)); + err = zfs_error(zhp->zfs_hdl, + EZFS_RECURSIVE, + dgettext(TEXT_DOMAIN, + "cannot determine dependent " + "datasets")); + zfs_close(zhp); + return (err); + } + } + } + + isf.zhp = zhp; + isf.next = ida->stack; + ida->stack = &isf; + err = zfs_iter_filesystems(zhp, iter_dependents_cb, ida); + if (err == 0) { + err = zfs_iter_snapshots(zhp, B_FALSE, + iter_dependents_cb, ida); + } + ida->stack = isf.next; + } + + if (!first && err == 0) + err = ida->func(zhp, ida->data); + else + zfs_close(zhp); + + return (err); +} + +int +zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, + zfs_iter_f func, void *data) +{ + iter_dependents_arg_t ida; + ida.allowrecursion = allowrecursion; + ida.stack = NULL; + ida.func = func; + ida.data = data; + ida.first = B_TRUE; + return (iter_dependents_cb(zfs_handle_dup(zhp), &ida)); +} Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c,v retrieving revision 1.4 diff -u -p -r1.4 libzfs_mount.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c 14 Dec 2010 01:22:24 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c 20 Apr 2017 22:40:09 -0000 @@ -20,8 +20,9 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. 
+ * Copyright 2016 Igor Kozhukhov */ /* @@ -44,17 +45,14 @@ * * zfs_is_shared_nfs() * zfs_is_shared_smb() - * zfs_is_shared_iscsi() * zfs_share_proto() * zfs_shareall(); - * zfs_share_iscsi() * zfs_unshare_nfs() * zfs_unshare_smb() * zfs_unshareall_nfs() * zfs_unshareall_smb() * zfs_unshareall() * zfs_unshareall_bypath() - * zfs_unshare_iscsi() * * The following functions are available for pool consumers, and will * mount/unmount and share/unshare all datasets within pool: @@ -66,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -76,24 +75,19 @@ #include #include #include +#include #include #include "libzfs_impl.h" #include -#include #define MAXISALEN 257 /* based on sysinfo(2) man page */ static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *); zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, zfs_share_proto_t); -static int (*iscsitgt_zfs_share)(const char *); -static int (*iscsitgt_zfs_unshare)(const char *); -static int (*iscsitgt_zfs_is_shared)(const char *); -static int (*iscsitgt_svc_online)(); - /* * The share protocols table must be in the same order as the zfs_share_prot_t * enum in libzfs_impl.h @@ -125,29 +119,6 @@ zfs_share_proto_t share_all_proto[] = { PROTO_END }; -#pragma init(zfs_iscsi_init) -static void -zfs_iscsi_init(void) -{ - void *libiscsitgt; - - if ((libiscsitgt = dlopen("/lib/libiscsitgt.so.1", - RTLD_LAZY | RTLD_GLOBAL)) == NULL || - (iscsitgt_zfs_share = (int (*)(const char *))dlsym(libiscsitgt, - "iscsitgt_zfs_share")) == NULL || - (iscsitgt_zfs_unshare = (int (*)(const char *))dlsym(libiscsitgt, - "iscsitgt_zfs_unshare")) == NULL || - (iscsitgt_zfs_is_shared = (int (*)(const char *))dlsym(libiscsitgt, - "iscsitgt_zfs_is_shared")) == NULL || - (iscsitgt_svc_online = (int (*)(const char *))dlsym(libiscsitgt, - "iscsitgt_svc_online")) == NULL) { - iscsitgt_zfs_share = NULL; - iscsitgt_zfs_unshare = NULL; - iscsitgt_zfs_is_shared = NULL; - iscsitgt_svc_online = NULL; - } -} - /* * Search the sharetab for the given mountpoint and protocol, returning * a zfs_share_type_t value. @@ -168,9 +139,10 @@ is_shared(libzfs_handle_t *hdl, const ch /* the mountpoint is the first entry on each line */ if ((tab = strchr(buf, '\t')) == NULL) continue; -#if defined(sun) + *tab = '\0'; if (strcmp(buf, mountpoint) == 0) { +#ifdef illumos /* * the protocol field is the third field * skip over second field @@ -193,17 +165,17 @@ is_shared(libzfs_handle_t *hdl, const ch return (0); } } - } #else if (proto == PROTO_NFS) return (SHARED_NFS); #endif - + } } return (SHARED_NOT_SHARED); } +#ifdef illumos /* * Returns true if the specified directory is empty. If we can't open the * directory at all, return true so that the mount can fail with a more @@ -231,6 +203,7 @@ dir_is_empty(const char *dirname) (void) closedir(dirp); return (B_TRUE); } +#endif /* * Checks to see if the mount is active. 
If the filesystem is mounted, we fill @@ -265,7 +238,7 @@ static boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, zprop_source_t *source) { - char sourceloc[ZFS_MAXNAMELEN]; + char sourceloc[MAXNAMELEN]; zprop_source_t sourcetype; if (!zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type)) @@ -307,6 +280,12 @@ zfs_mount(zfs_handle_t *zhp, const char else (void) strlcpy(mntopts, options, sizeof (mntopts)); + /* + * If the pool is imported read-only then all mounts must be read-only + */ + if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL)) + flags |= MS_RDONLY; + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) return (0); @@ -321,6 +300,7 @@ zfs_mount(zfs_handle_t *zhp, const char } } +#ifdef illumos /* FreeBSD: overlay mounts are not checked. */ /* * Determine if the mountpoint is empty. If so, refuse to perform the * mount. We don't perform this check if MS_OVERLAY is specified, which @@ -335,9 +315,10 @@ zfs_mount(zfs_handle_t *zhp, const char return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); } +#endif /* perform the mount */ - if (mount(zfs_get_name(zhp), mountpoint, MS_OPTIONSTR | flags, + if (zmount(zfs_get_name(zhp), mountpoint, flags, MNTTYPE_ZFS, NULL, 0, mntopts, sizeof (mntopts)) != 0) { /* * Generic errors are nasty, but there are just way too many @@ -350,6 +331,18 @@ zfs_mount(zfs_handle_t *zhp, const char } else if (errno == EPERM) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Insufficient privileges")); + } else if (errno == ENOTSUP) { + char buf[256]; + int spa_version; + + VERIFY(zfs_spa_version(zhp, &spa_version) == 0); + (void) snprintf(buf, sizeof (buf), + dgettext(TEXT_DOMAIN, "Can't mount a version %lld " + "file system on a version %d pool. Pool must be" + " upgraded to mount this file system."), + (u_longlong_t)zfs_prop_get_int(zhp, + ZFS_PROP_VERSION), spa_version); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf)); } else { zfs_error_aux(hdl, strerror(errno)); } @@ -450,7 +443,7 @@ zfs_is_shared(zfs_handle_t *zhp) zfs_share_proto_t *curr_proto; if (ZFS_IS_VOLUME(zhp)) - return (zfs_is_shared_iscsi(zhp)); + return (B_FALSE); for (curr_proto = share_all_proto; *curr_proto != PROTO_END; curr_proto++) @@ -462,18 +455,14 @@ zfs_is_shared(zfs_handle_t *zhp) int zfs_share(zfs_handle_t *zhp) { - if (ZFS_IS_VOLUME(zhp)) - return (zfs_share_iscsi(zhp)); - + assert(!ZFS_IS_VOLUME(zhp)); return (zfs_share_proto(zhp, share_all_proto)); } int zfs_unshare(zfs_handle_t *zhp) { - if (ZFS_IS_VOLUME(zhp)) - return (zfs_unshare_iscsi(zhp)); - + assert(!ZFS_IS_VOLUME(zhp)); return (zfs_unshareall(zhp)); } @@ -489,7 +478,8 @@ zfs_is_shared_proto(zfs_handle_t *zhp, c if (!zfs_is_mounted(zhp, &mountpoint)) return (SHARED_NOT_SHARED); - if (rc = is_shared(zhp->zfs_hdl, mountpoint, proto)) { + if ((rc = is_shared(zhp->zfs_hdl, mountpoint, proto)) + != SHARED_NOT_SHARED) { if (where != NULL) *where = mountpoint; else @@ -520,7 +510,8 @@ zfs_is_shared_smb(zfs_handle_t *zhp, cha * wrapper functions that check to see that the pointers to functions * initialized in _zfs_init_libshare() are actually present. 
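
As a caller-side illustration of the zfs_mount() changes above (zmount() instead of mount(), and the automatic MS_RDONLY when the pool was imported read-only), here is a minimal sketch. The dataset name handling, error message and overall structure are illustrative assumptions, not part of the patch.

	#include <stdio.h>
	#include <libzfs.h>

	/*
	 * Sketch: mount one dataset.  If its pool was imported read-only,
	 * zfs_mount() now forces MS_RDONLY internally, so the caller does
	 * not need to pass any extra flag.
	 */
	int
	mount_one(libzfs_handle_t *hdl, const char *dsname)
	{
		zfs_handle_t *zhp;
		int err;

		if ((zhp = zfs_open(hdl, dsname, ZFS_TYPE_FILESYSTEM)) == NULL)
			return (-1);

		if (zfs_is_mounted(zhp, NULL)) {
			zfs_close(zhp);
			return (0);		/* already mounted */
		}

		err = zfs_mount(zhp, NULL, 0);	/* no options, no extra flags */
		if (err != 0)
			(void) fprintf(stderr, "mount of %s failed\n", dsname);

		zfs_close(zhp);
		return (err);
	}
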
*/ -#ifdef PORT_SOLARIS + +#ifdef illumos static sa_handle_t (*_sa_init)(int); static void (*_sa_fini)(sa_handle_t); static sa_share_t (*_sa_find_share)(sa_handle_t, char *); @@ -534,6 +525,7 @@ static int (*_sa_zfs_process_share)(sa_h char *, char *, zprop_source_t, char *, char *, char *); static void (*_sa_update_sharetab_ts)(sa_handle_t); #endif + /* * _zfs_init_libshare() * @@ -546,7 +538,7 @@ static void (*_sa_update_sharetab_ts)(sa static void _zfs_init_libshare(void) { -#ifdef PORT_SOLARIS +#ifdef illumos void *libshare; char path[MAXPATHLEN]; char isa[MAXISALEN]; @@ -616,7 +608,8 @@ int zfs_init_libshare(libzfs_handle_t *zhandle, int service) { int ret = SA_OK; -#ifdef PORT_SOLARIS + +#ifdef illumos if (_sa_init == NULL) ret = SA_CONFIG_ERR; @@ -643,6 +636,7 @@ zfs_init_libshare(libzfs_handle_t *zhand if (ret == SA_OK && zhandle->libzfs_sharehdl == NULL) ret = SA_NO_MEMORY; #endif + return (ret); } @@ -656,7 +650,7 @@ void zfs_uninit_libshare(libzfs_handle_t *zhandle) { if (zhandle != NULL && zhandle->libzfs_sharehdl != NULL) { -#ifdef PORT_SOLARIS +#ifdef illumos if (_sa_fini != NULL) _sa_fini(zhandle->libzfs_sharehdl); #endif @@ -673,7 +667,7 @@ zfs_uninit_libshare(libzfs_handle_t *zha int zfs_parse_options(char *options, zfs_share_proto_t proto) { -#ifdef PORT_SOLARIS +#ifdef illumos if (_sa_parse_legacy_options != NULL) { return (_sa_parse_legacy_options(NULL, options, proto_table[proto].p_name)); @@ -684,7 +678,7 @@ zfs_parse_options(char *options, zfs_sha #endif } -#ifdef PORT_SOLARIS +#ifdef illumos /* * zfs_sa_find_share(handle, path) * @@ -726,7 +720,8 @@ zfs_sa_disable_share(sa_share_t share, c return (_sa_disable_share(share, proto)); return (SA_CONFIG_ERR); } -#endif +#endif /* illumos */ + /* * Share the given filesystem according to the options in the specified * protocol specific properties (sharenfs, sharesmb). We rely @@ -739,22 +734,13 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s char shareopts[ZFS_MAXPROPLEN]; char sourcestr[ZFS_MAXPROPLEN]; libzfs_handle_t *hdl = zhp->zfs_hdl; - sa_share_t share; zfs_share_proto_t *curr_proto; zprop_source_t sourcetype; int error, ret; if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL)) return (0); -#ifdef PORT_SOLARIS - if ((ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API)) != SA_OK) { - (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, - dgettext(TEXT_DOMAIN, "cannot share '%s': %s"), - zfs_get_name(zhp), _sa_errorstr != NULL ? - _sa_errorstr(ret) : ""); - return (-1); - } -#endif + for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { /* * Return success if there are no share options. @@ -765,6 +751,17 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s strcmp(shareopts, "off") == 0) continue; +#ifdef illumos + ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API); + if (ret != SA_OK) { + (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, + dgettext(TEXT_DOMAIN, "cannot share '%s': %s"), + zfs_get_name(zhp), _sa_errorstr != NULL ? + _sa_errorstr(ret) : ""); + return (-1); + } +#endif + /* * If the 'zoned' property is set, then zfs_is_mountable() * will have already bailed out if we are in the global zone. 
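
Since zfs_share() and zfs_unshare() now assert that they are never handed a volume (the iSCSI path is gone), a consumer has to do its own type filtering. A minimal sketch, assuming the caller already holds an open dataset handle; the helper name is illustrative.

	#include <libzfs.h>

	/*
	 * Sketch: (re)share a filesystem.  Volumes and snapshots are skipped
	 * by the caller; zfs_share()/zfs_unshare() only assert after this
	 * patch.  zfs_share() honours the sharenfs/sharesmb properties.
	 */
	static int
	reshare_fs(zfs_handle_t *zhp)
	{
		if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM)
			return (0);

		if (zfs_is_shared(zhp) && zfs_unshareall(zhp) != 0)
			return (-1);

		return (zfs_share(zhp));
	}
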
@@ -774,7 +771,7 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) continue; -#ifdef PORT_SOLARIS +#ifdef illumos share = zfs_sa_find_share(hdl->libzfs_sharehdl, mountpoint); if (share == NULL) { /* @@ -811,14 +808,28 @@ zfs_share_proto(zfs_handle_t *zhp, zfs_s zfs_get_name(zhp)); return (-1); } - } else { + } else +#else + if (*curr_proto != PROTO_NFS) { + fprintf(stderr, "Unsupported share protocol: %d.\n", + *curr_proto); + continue; + } + + if (strcmp(shareopts, "on") == 0) + error = fsshare(ZFS_EXPORTS_PATH, mountpoint, ""); + else + error = fsshare(ZFS_EXPORTS_PATH, mountpoint, shareopts); + if (error != 0) +#endif + { (void) zfs_error_fmt(hdl, proto_table[*curr_proto].p_share_err, dgettext(TEXT_DOMAIN, "cannot share '%s'"), zfs_get_name(zhp)); return (-1); } -#endif + } return (0); } @@ -849,7 +860,7 @@ static int unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, zfs_share_proto_t proto) { -#ifdef PORT_SOLARIS +#ifdef illumos sa_share_t share; int err; char *mntpt; @@ -883,6 +894,23 @@ unshare_one(libzfs_handle_t *hdl, const dgettext(TEXT_DOMAIN, "cannot unshare '%s': not found"), name)); } +#else + char buf[MAXPATHLEN]; + FILE *fp; + int err; + + if (proto != PROTO_NFS) { + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (EOPNOTSUPP); + } + + err = fsunshare(ZFS_EXPORTS_PATH, mountpoint); + if (err != 0) { + zfs_error_aux(hdl, "%s", strerror(err)); + return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED, + dgettext(TEXT_DOMAIN, + "cannot unshare '%s'"), name)); + } #endif return (0); } @@ -1016,94 +1044,29 @@ remove_mountpoint(zfs_handle_t *zhp) } } -boolean_t -zfs_is_shared_iscsi(zfs_handle_t *zhp) -{ - - /* - * If iscsi deamon isn't running then we aren't shared - */ - if (iscsitgt_svc_online && iscsitgt_svc_online() == 1) - return (B_FALSE); - else - return (iscsitgt_zfs_is_shared != NULL && - iscsitgt_zfs_is_shared(zhp->zfs_name) != 0); -} - -int -zfs_share_iscsi(zfs_handle_t *zhp) -{ - char shareopts[ZFS_MAXPROPLEN]; - const char *dataset = zhp->zfs_name; - libzfs_handle_t *hdl = zhp->zfs_hdl; - - /* - * Return success if there are no share options. - */ - if (zfs_prop_get(zhp, ZFS_PROP_SHAREISCSI, shareopts, - sizeof (shareopts), NULL, NULL, 0, B_FALSE) != 0 || - strcmp(shareopts, "off") == 0) - return (0); -#ifdef PORT_ISCSI /* NetBSD do not support zfssharing with iscsi, yet */ - if (iscsitgt_zfs_share == NULL || iscsitgt_zfs_share(dataset) != 0) { - int error = EZFS_SHAREISCSIFAILED; - - /* - * If service isn't availabele and EPERM was - * returned then use special error. - */ - if (iscsitgt_svc_online && errno == EPERM && - (iscsitgt_svc_online() != 0)) - error = EZFS_ISCSISVCUNAVAIL; - - return (zfs_error_fmt(hdl, error, - dgettext(TEXT_DOMAIN, "cannot share '%s'"), dataset)); - } -#endif - return (0); -} - -int -zfs_unshare_iscsi(zfs_handle_t *zhp) +void +libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp) { - const char *dataset = zfs_get_name(zhp); - libzfs_handle_t *hdl = zhp->zfs_hdl; - -#ifdef PORT_ISCSI /* NetBSD do not support zfssharing with iscsi, yet */ - /* - * Return if the volume is not shared - */ - if (zfs_is_shared_iscsi(zhp) != SHARED_ISCSI) - return (0); + if (cbp->cb_alloc == cbp->cb_used) { + size_t newsz; + void *ptr; - /* - * If this fails with ENODEV it indicates that zvol wasn't shared so - * we should return success in that case. 
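
The non-illumos branch above routes NFS sharing through the exports-file helpers from the FreeBSD compat layer. The sketch below only shows the shape of that mapping; the fsshare()/fsunshare() prototypes and the ZFS_EXPORTS_PATH default are assumptions taken from that compat code, not something this patch defines.

	#include <string.h>

	#ifndef ZFS_EXPORTS_PATH
	#define	ZFS_EXPORTS_PATH	"/etc/zfs/exports"	/* assumed default */
	#endif

	/* assumed prototypes of the compat helpers used in the hunk above */
	extern int fsshare(const char *file, const char *mountpoint,
	    const char *opts);
	extern int fsunshare(const char *file, const char *mountpoint);

	/*
	 * Sketch: a sharenfs value of "on" becomes an exports entry with no
	 * options; anything else is passed through as the option string.
	 */
	static int
	export_mountpoint(const char *mountpoint, const char *shareopts)
	{
		const char *opts =
		    (strcmp(shareopts, "on") == 0) ? "" : shareopts;

		return (fsshare(ZFS_EXPORTS_PATH, mountpoint, opts));
	}
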
- */ - if (iscsitgt_zfs_unshare == NULL || - (iscsitgt_zfs_unshare(dataset) != 0 && errno != ENODEV)) { - if (errno == EPERM) - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "Insufficient privileges to unshare iscsi")); - return (zfs_error_fmt(hdl, EZFS_UNSHAREISCSIFAILED, - dgettext(TEXT_DOMAIN, "cannot unshare '%s'"), dataset)); + newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64; + ptr = zfs_realloc(zhp->zfs_hdl, + cbp->cb_handles, cbp->cb_alloc * sizeof (void *), + newsz * sizeof (void *)); + cbp->cb_handles = ptr; + cbp->cb_alloc = newsz; } -#endif - return (0); + cbp->cb_handles[cbp->cb_used++] = zhp; } -typedef struct mount_cbdata { - zfs_handle_t **cb_datasets; - int cb_used; - int cb_alloc; -} mount_cbdata_t; - static int mount_cb(zfs_handle_t *zhp, void *data) { - mount_cbdata_t *cbp = data; + get_all_cb_t *cbp = data; - if (!(zfs_get_type(zhp) & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) { + if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) { zfs_close(zhp); return (0); } @@ -1113,25 +1076,27 @@ mount_cb(zfs_handle_t *zhp, void *data) return (0); } - if (cbp->cb_alloc == cbp->cb_used) { - void *ptr; - - if ((ptr = zfs_realloc(zhp->zfs_hdl, - cbp->cb_datasets, cbp->cb_alloc * sizeof (void *), - cbp->cb_alloc * 2 * sizeof (void *))) == NULL) - return (-1); - cbp->cb_datasets = ptr; - - cbp->cb_alloc *= 2; + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + zfs_close(zhp); + return (0); } - cbp->cb_datasets[cbp->cb_used++] = zhp; - - return (zfs_iter_filesystems(zhp, mount_cb, cbp)); + libzfs_add_handle(cbp, zhp); + if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) { + zfs_close(zhp); + return (-1); + } + return (0); } -static int -dataset_cmp(const void *a, const void *b) +int +libzfs_dataset_cmp(const void *a, const void *b) { zfs_handle_t **za = (zfs_handle_t **)a; zfs_handle_t **zb = (zfs_handle_t **)b; @@ -1169,7 +1134,7 @@ dataset_cmp(const void *a, const void *b int zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) { - mount_cbdata_t cb = { 0 }; + get_all_cb_t cb = { 0 }; libzfs_handle_t *hdl = zhp->zpool_hdl; zfs_handle_t *zfsp; int i, ret = -1; @@ -1178,23 +1143,17 @@ zpool_enable_datasets(zpool_handle_t *zh /* * Gather all non-snap datasets within the pool. */ - if ((cb.cb_datasets = zfs_alloc(hdl, 4 * sizeof (void *))) == NULL) - return (-1); - cb.cb_alloc = 4; - if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) == NULL) goto out; - cb.cb_datasets[0] = zfsp; - cb.cb_used = 1; - + libzfs_add_handle(&cb, zfsp); if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0) goto out; - /* * Sort the datasets by mountpoint. */ - qsort(cb.cb_datasets, cb.cb_used, sizeof (void *), dataset_cmp); + qsort(cb.cb_handles, cb.cb_used, sizeof (void *), + libzfs_dataset_cmp); /* * And mount all the datasets, keeping track of which ones @@ -1206,7 +1165,7 @@ zpool_enable_datasets(zpool_handle_t *zh ret = 0; for (i = 0; i < cb.cb_used; i++) { - if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0) + if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0) ret = -1; else good[i] = 1; @@ -1219,7 +1178,7 @@ zpool_enable_datasets(zpool_handle_t *zh * zfs_alloc is supposed to exit if memory isn't available. 
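
libzfs_add_handle() and libzfs_dataset_cmp() become public helpers in this change, so other consumers can collect and order datasets the same way zpool_enable_datasets() does. A minimal sketch, assuming the get_all_cb_t was zero-initialized by the caller; handle cleanup is left out.

	#include <stdlib.h>
	#include <libzfs.h>

	/* Sketch: recursively gather every filesystem under 'zhp'. */
	static int
	gather_cb(zfs_handle_t *zhp, void *data)
	{
		get_all_cb_t *cbp = data;

		libzfs_add_handle(cbp, zhp);	/* grows cb_handles as needed */
		return (zfs_iter_filesystems(zhp, gather_cb, cbp));
	}

	/*
	 * Sketch: collect the root filesystem and all descendants, then sort
	 * them by mountpoint so parents are mounted before children.
	 */
	static int
	collect_sorted(zfs_handle_t *zfsp, get_all_cb_t *cbp)
	{
		libzfs_add_handle(cbp, zfsp);
		if (zfs_iter_filesystems(zfsp, gather_cb, cbp) != 0)
			return (-1);
		qsort(cbp->cb_handles, cbp->cb_used, sizeof (void *),
		    libzfs_dataset_cmp);
		return (0);
	}
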
*/ for (i = 0; i < cb.cb_used; i++) { - if (good[i] && zfs_share(cb.cb_datasets[i]) != 0) + if (good[i] && zfs_share(cb.cb_handles[i]) != 0) ret = -1; } @@ -1227,27 +1186,12 @@ zpool_enable_datasets(zpool_handle_t *zh out: for (i = 0; i < cb.cb_used; i++) - zfs_close(cb.cb_datasets[i]); - free(cb.cb_datasets); + zfs_close(cb.cb_handles[i]); + free(cb.cb_handles); return (ret); } -/*ARGSUSED1*/ -static int -zvol_cb(zfs_handle_t *zhp, void *unused) -{ - int error = 0; - - if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) - (void) zfs_iter_children(zhp, zvol_cb, NULL); - if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) - error = zfs_unshare_iscsi(zhp); - zfs_close(zhp); - - return (error); -} - static int mountpoint_compare(const void *a, const void *b) { @@ -1270,48 +1214,34 @@ int zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) { int used, alloc; - struct statvfs *sfs; - int n; + struct mnttab entry; size_t namelen; char **mountpoints = NULL; - zfs_handle_t *zfp; zfs_handle_t **datasets = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; int i; int ret = -1; int flags = (force ? MS_FORCE : 0); - /* - * First unshare all zvols. - */ - zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, - ZFS_TYPE_FILESYSTEM); - if (zfp != NULL) { - (void) zfs_iter_children(zfp, zvol_cb, NULL); - zfs_close(zfp); - } - namelen = strlen(zhp->zpool_name); rewind(hdl->libzfs_mnttab); used = alloc = 0; - if ((n = getmntinfo(&sfs, MNT_WAIT)) == 0) { - fprintf(stderr, "getmntinfo(): %s\n", strerror(errno)); - return (-1); - } - for (i = 0; i < n; i++) { + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { /* * Ignore non-ZFS entries. */ - if (strcmp(sfs[i].f_fstypename, MNTTYPE_ZFS) != 0) + if (entry.mnt_fstype == NULL || + strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) continue; /* * Ignore filesystems not within this pool. */ - if (strncmp(sfs[i].f_mntfromname, zhp->zpool_name, namelen) != 0 || - (sfs[i].f_mntfromname[namelen] != '/' && - sfs[i].f_mntfromname[namelen] != '\0')) + if (entry.mnt_mountp == NULL || + strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 || + (entry.mnt_special[namelen] != '/' && + entry.mnt_special[namelen] != '\0')) continue; /* @@ -1349,7 +1279,7 @@ zpool_disable_datasets(zpool_handle_t *z } if ((mountpoints[used] = zfs_strdup(hdl, - sfs[i].f_mntonname)) == NULL) + entry.mnt_mountp)) == NULL) goto out; /* @@ -1357,7 +1287,7 @@ zpool_disable_datasets(zpool_handle_t *z * is only used to determine if we need to remove the underlying * mountpoint, so failure is not fatal. */ - datasets[used] = make_dataset_handle(hdl, sfs[i].f_mntfromname); + datasets[used] = make_dataset_handle(hdl, entry.mnt_special); used++; } Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c,v retrieving revision 1.4 diff -u -p -r1.4 libzfs_pool.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c 10 Jan 2017 19:20:34 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c 5 May 2017 17:36:25 -0000 @@ -20,10 +20,15 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. 
+ * Copyright 2016 Igor Kozhukhov */ +#include +#include #include #include #include @@ -33,73 +38,25 @@ #include #include #include -#include -#include +#include #include -#include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" #include "zfs_comutil.h" - -const char *hist_event_table[LOG_END] = { - "invalid event", - "pool create", - "vdev add", - "pool remove", - "pool destroy", - "pool export", - "pool import", - "vdev attach", - "vdev replace", - "vdev detach", - "vdev online", - "vdev offline", - "vdev upgrade", - "pool clear", - "pool scrub", - "pool property set", - "create", - "clone", - "destroy", - "destroy_begin_sync", - "inherit", - "property set", - "quota set", - "permission update", - "permission remove", - "permission who remove", - "promote", - "receive", - "rename", - "reservation set", - "replay_inc_sync", - "replay_full_sync", - "rollback", - "snapshot", - "filesystem version upgrade", - "refquota set", - "refreservation set", - "pool scrub done", - "user hold", - "user release", - "pool split", -}; +#include "zfeature_common.h" static int read_efi_label(nvlist_t *config, diskaddr_t *sb); -#if defined(__i386) || defined(__amd64) -#define BOOTCMD "installgrub(1M)" -#else -#define BOOTCMD "installboot(1M)" -#endif - -#define DISK_ROOT "/dev/dsk" -#define RDISK_ROOT "/dev/rdsk" #define BACKUP_SLICE "s2" +typedef struct prop_flags { + int create:1; /* Validate property on creation */ + int import:1; /* Validate property on import */ +} prop_flags_t; + /* * ==================================================================== * zpool property functions @@ -169,14 +126,14 @@ zpool_get_prop_string(zpool_handle_t *zh verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); } else { source = ZPROP_SRC_DEFAULT; - if ((value = __UNCONST(zpool_prop_default_string(prop))) == NULL) - value = __UNCONST("-"); + if ((value = (char *)zpool_prop_default_string(prop)) == NULL) + value = "-"; } if (src) *src = source; - return value; + return (value); } uint64_t @@ -243,10 +200,40 @@ zpool_state_to_name(vdev_state_t state, return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: return (gettext("ONLINE")); - case VDEV_STATE_UNKNOWN: + default: - return (gettext("UNKNOWN")); + break; + } + + return (gettext("UNKNOWN")); +} + +/* + * Map POOL STATE to printed strings. 
+ */ +const char * +zpool_pool_state_to_name(pool_state_t state) +{ + switch (state) { + case POOL_STATE_ACTIVE: + return (gettext("ACTIVE")); + case POOL_STATE_EXPORTED: + return (gettext("EXPORTED")); + case POOL_STATE_DESTROYED: + return (gettext("DESTROYED")); + case POOL_STATE_SPARE: + return (gettext("SPARE")); + case POOL_STATE_L2CACHE: + return (gettext("L2CACHE")); + case POOL_STATE_UNINITIALIZED: + return (gettext("UNINITIALIZED")); + case POOL_STATE_UNAVAIL: + return (gettext("UNAVAIL")); + case POOL_STATE_POTENTIALLY_ACTIVE: + return (gettext("POTENTIALLY_ACTIVE")); } + + return (gettext("UNKNOWN")); } /* @@ -255,7 +242,7 @@ zpool_state_to_name(vdev_state_t state, */ int zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, - zprop_source_t *srctype) + zprop_source_t *srctype, boolean_t literal) { uint64_t intval; const char *strval; @@ -271,24 +258,24 @@ zpool_get_prop(zpool_handle_t *zhp, zpoo break; case ZPOOL_PROP_HEALTH: - (void) strlcpy(buf, "FAULTED", len); + (void) strlcpy(buf, + zpool_pool_state_to_name(POOL_STATE_UNAVAIL), len); break; case ZPOOL_PROP_GUID: intval = zpool_get_prop_int(zhp, prop, &src); - (void) snprintf(buf, len, "%" PRIu64, intval); + (void) snprintf(buf, len, "%llu", intval); break; case ZPOOL_PROP_ALTROOT: case ZPOOL_PROP_CACHEFILE: + case ZPOOL_PROP_COMMENT: if (zhp->zpool_props != NULL || zpool_get_all_props(zhp) == 0) { (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), len); - if (srctype != NULL) - *srctype = src; - return (0); + break; } /* FALLTHROUGH */ default: @@ -318,31 +305,65 @@ zpool_get_prop(zpool_handle_t *zhp, zpoo case ZPOOL_PROP_SIZE: case ZPOOL_PROP_ALLOCATED: case ZPOOL_PROP_FREE: - (void) zfs_nicenum(intval, buf, len); + case ZPOOL_PROP_FREEING: + case ZPOOL_PROP_LEAKED: + if (literal) { + (void) snprintf(buf, len, "%llu", + (u_longlong_t)intval); + } else { + (void) zfs_nicenum(intval, buf, len); + } + break; + case ZPOOL_PROP_EXPANDSZ: + if (intval == 0) { + (void) strlcpy(buf, "-", len); + } else if (literal) { + (void) snprintf(buf, len, "%llu", + (u_longlong_t)intval); + } else { + (void) zfs_nicenum(intval, buf, len); + } break; - case ZPOOL_PROP_CAPACITY: - (void) snprintf(buf, len, "%ju%%", - (uintmax_t)intval); + if (literal) { + (void) snprintf(buf, len, "%llu", + (u_longlong_t)intval); + } else { + (void) snprintf(buf, len, "%llu%%", + (u_longlong_t)intval); + } + break; + case ZPOOL_PROP_FRAGMENTATION: + if (intval == UINT64_MAX) { + (void) strlcpy(buf, "-", len); + } else { + (void) snprintf(buf, len, "%llu%%", + (u_longlong_t)intval); + } break; - case ZPOOL_PROP_DEDUPRATIO: - (void) snprintf(buf, len, "%ju.%02jux", - (uintmax_t)(intval / 100), - (uintmax_t)(intval % 100)); + (void) snprintf(buf, len, "%llu.%02llux", + (u_longlong_t)(intval / 100), + (u_longlong_t)(intval % 100)); break; - case ZPOOL_PROP_HEALTH: verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); verify(nvlist_lookup_uint64_array(nvroot, - ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); (void) strlcpy(buf, zpool_state_to_name(intval, vs->vs_aux), len); break; + case ZPOOL_PROP_VERSION: + if (intval >= SPA_VERSION_FEATURES) { + (void) snprintf(buf, len, "-"); + break; + } + /* FALLTHROUGH */ default: - (void) snprintf(buf, len, "%ju", (uintmax_t)intval); + (void) snprintf(buf, len, "%llu", intval); } break; @@ -383,34 +404,13 @@ bootfs_name_valid(const char *pool, char return (B_FALSE); } -/* - * Inspect the 
configuration to determine if any of the devices contain - * an EFI label. - */ -static boolean_t -pool_uses_efi(nvlist_t *config) -{ - nvlist_t **child; - uint_t c, children; - - if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - return (read_efi_label(config, NULL) >= 0); - - for (c = 0; c < children; c++) { - if (pool_uses_efi(child[c])) - return (B_TRUE); - } - return (B_FALSE); -} - -static boolean_t -pool_is_bootable(zpool_handle_t *zhp) +boolean_t +zpool_is_bootable(zpool_handle_t *zhp) { - char bootfs[ZPOOL_MAXNAMELEN]; + char bootfs[ZFS_MAX_DATASET_NAME_LEN]; return (zpool_get_prop(zhp, ZPOOL_PROP_BOOTFS, bootfs, - sizeof (bootfs), NULL) == 0 && strncmp(bootfs, "-", + sizeof (bootfs), NULL, B_FALSE) == 0 && strncmp(bootfs, "-", sizeof (bootfs)) != 0); } @@ -422,17 +422,16 @@ pool_is_bootable(zpool_handle_t *zhp) */ static nvlist_t * zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, - nvlist_t *props, uint64_t version, boolean_t create_or_import, char *errbuf) + nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) { nvpair_t *elem; nvlist_t *retprops; zpool_prop_t prop; char *strval; uint64_t intval; - char *slash; + char *slash, *check; struct stat64 statbuf; zpool_handle_t *zhp; - nvlist_t *nvroot; if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); @@ -443,10 +442,47 @@ zpool_valid_proplist(libzfs_handle_t *hd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { const char *propname = nvpair_name(elem); + prop = zpool_name_to_prop(propname); + if (prop == ZPROP_INVAL && zpool_prop_feature(propname)) { + int err; + char *fname = strchr(propname, '@') + 1; + + err = zfeature_lookup_name(fname, NULL); + if (err != 0) { + ASSERT3U(err, ==, ENOENT); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid feature '%s'"), fname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvpair_type(elem) != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set to " + "'enabled'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvlist_add_uint64(retprops, propname, 0) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + /* * Make sure this property is valid and applies to this type. 
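
zpool_get_prop() gains a boolean_t literal argument in the hunk above, so existing callers need updating. A short sketch of the two forms; the property choice and output format are illustrative.

	#include <stdio.h>
	#include <libzfs.h>

	/*
	 * Sketch: B_FALSE gives the human-readable form ("1.2T", "43%"),
	 * B_TRUE the raw numeric value.
	 */
	static void
	show_free(zpool_handle_t *zhp)
	{
		char nice[ZFS_MAXPROPLEN], raw[ZFS_MAXPROPLEN];

		if (zpool_get_prop(zhp, ZPOOL_PROP_FREE, nice, sizeof (nice),
		    NULL, B_FALSE) == 0 &&
		    zpool_get_prop(zhp, ZPOOL_PROP_FREE, raw, sizeof (raw),
		    NULL, B_TRUE) == 0)
			(void) printf("free: %s (%s bytes)\n", nice, raw);
	}
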
*/ - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) { + if (prop == ZPROP_INVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); @@ -469,7 +505,8 @@ zpool_valid_proplist(libzfs_handle_t *hd */ switch (prop) { case ZPOOL_PROP_VERSION: - if (intval < version || intval > SPA_VERSION) { + if (intval < version || + !SPA_VERSION_IS_SUPPORTED(intval)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' number %d is invalid."), propname, intval); @@ -479,7 +516,7 @@ zpool_valid_proplist(libzfs_handle_t *hd break; case ZPOOL_PROP_BOOTFS: - if (create_or_import) { + if (flags.create || flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' cannot be set at creation " "or import time"), propname); @@ -513,26 +550,11 @@ zpool_valid_proplist(libzfs_handle_t *hd (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); goto error; } - verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), - ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - - /* - * bootfs property cannot be set on a disk which has - * been EFI labeled. - */ - if (pool_uses_efi(nvroot)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property '%s' not supported on " - "EFI labeled devices"), propname); - (void) zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf); - zpool_close(zhp); - goto error; - } zpool_close(zhp); break; case ZPOOL_PROP_ALTROOT: - if (!create_or_import) { + if (!flags.create && !flags.import) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "property '%s' can only be set during pool " "creation or import"), propname); @@ -587,21 +609,40 @@ zpool_valid_proplist(libzfs_handle_t *hd *slash = '/'; break; - case ZPOOL_PROP_FREE: - case ZPOOL_PROP_ALLOCATED: - case ZPOOL_NUM_PROPS: - case ZPOOL_PROP_AUTOEXPAND: - case ZPOOL_PROP_DEDUPDITTO: - case ZPOOL_PROP_SIZE: - case ZPOOL_PROP_CAPACITY: - case ZPOOL_PROP_HEALTH: - case ZPOOL_PROP_GUID: - case ZPOOL_PROP_DELEGATION: - case ZPOOL_PROP_AUTOREPLACE: - case ZPOOL_PROP_FAILUREMODE: - case ZPOOL_PROP_LISTSNAPS: - case ZPOOL_PROP_DEDUPRATIO: - case ZPOOL_PROP_NAME: + + case ZPOOL_PROP_COMMENT: + for (check = strval; *check != '\0'; check++) { + if (!isprint(*check)) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "comment may only have printable " + "characters")); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + } + if (strlen(strval) > ZPROP_MAX_COMMENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "comment must not exceed %d characters"), + ZPROP_MAX_COMMENT); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + case ZPOOL_PROP_READONLY: + if (!flags.import) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set at " + "import time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + + default: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s'(%d) not defined"), propname, prop); break; } } @@ -624,6 +665,7 @@ zpool_set_prop(zpool_handle_t *zhp, cons nvlist_t *nvl = NULL; nvlist_t *realprops; uint64_t version; + prop_flags_t flags = { 0 }; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), @@ -639,7 +681,7 @@ zpool_set_prop(zpool_handle_t *zhp, cons version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, - zhp->zpool_name, nvl, version, B_FALSE, errbuf)) == NULL) { + zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { nvlist_free(nvl); return (-1); } @@ -676,10 +718,77 @@ 
zpool_expand_proplist(zpool_handle_t *zh libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; char buf[ZFS_MAXPROPLEN]; + nvlist_t *features = NULL; + zprop_list_t **last; + boolean_t firstexpand = (NULL == *plp); if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) return (-1); + last = plp; + while (*last != NULL) + last = &(*last)->pl_next; + + if ((*plp)->pl_all) + features = zpool_get_features(zhp); + + if ((*plp)->pl_all && firstexpand) { + for (int i = 0; i < SPA_FEATURES; i++) { + zprop_list_t *entry = zfs_alloc(hdl, + sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", + spa_feature_table[i].fi_uname); + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + } + + /* add any unsupported features */ + for (nvpair_t *nvp = nvlist_next_nvpair(features, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { + char *propname; + boolean_t found; + zprop_list_t *entry; + + if (zfeature_is_supported(nvpair_name(nvp))) + continue; + + propname = zfs_asprintf(hdl, "unsupported@%s", + nvpair_name(nvp)); + + /* + * Before adding the property to the list make sure that no + * other pool already added the same property. + */ + found = B_FALSE; + entry = *plp; + while (entry != NULL) { + if (entry->pl_user_prop != NULL && + strcmp(propname, entry->pl_user_prop) == 0) { + found = B_TRUE; + break; + } + entry = entry->pl_next; + } + if (found) { + free(propname); + continue; + } + + entry = zfs_alloc(hdl, sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = propname; + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + for (entry = *plp; entry != NULL; entry = entry->pl_next) { if (entry->pl_fixed) @@ -687,7 +796,7 @@ zpool_expand_proplist(zpool_handle_t *zh if (entry->pl_prop != ZPROP_INVAL && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), - NULL) == 0) { + NULL, B_FALSE) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } @@ -696,6 +805,66 @@ zpool_expand_proplist(zpool_handle_t *zh return (0); } +/* + * Get the state for the given feature on the given ZFS pool. + */ +int +zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, + size_t len) +{ + uint64_t refcount; + boolean_t found = B_FALSE; + nvlist_t *features = zpool_get_features(zhp); + boolean_t supported; + const char *feature = strchr(propname, '@') + 1; + + supported = zpool_prop_feature(propname); + ASSERT(supported || zpool_prop_unsupported(propname)); + + /* + * Convert from feature name to feature guid. This conversion is + * unecessary for unsupported@... properties because they already + * use guids. 
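
The new zpool_prop_get_feature() can be driven as below; a minimal sketch, with the feature name chosen only for illustration. ENOTSUP means the pool (or the library) does not know the feature at all, matching the "-" output in the hunk above.

	#include <stdio.h>
	#include <libzfs.h>

	/* Sketch: query one feature state on an open pool handle. */
	static void
	show_feature(zpool_handle_t *zhp)
	{
		char state[64];

		if (zpool_prop_get_feature(zhp, "feature@async_destroy",
		    state, sizeof (state)) == 0)
			(void) printf("async_destroy: %s\n", state);
		else
			(void) printf("async_destroy: not known to this pool\n");
	}
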
+ */ + if (supported) { + int ret; + spa_feature_t fid; + + ret = zfeature_lookup_name(feature, &fid); + if (ret != 0) { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + feature = spa_feature_table[fid].fi_guid; + } + + if (nvlist_lookup_uint64(features, feature, &refcount) == 0) + found = B_TRUE; + + if (supported) { + if (!found) { + (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); + } else { + if (refcount == 0) + (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); + else + (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); + } + } else { + if (found) { + if (refcount == 0) { + (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); + } else { + (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); + } + } else { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + } + + return (0); +} /* * Don't start the slice at the default block of 34; many storage @@ -782,9 +951,10 @@ zpool_name_valid(libzfs_handle_t *hdl, b zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "multiple '@' delimiters in name")); break; - case NAME_ERR_NO_AT: + + default: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "no attribute in name")); + "(%d) not defined"), why); break; } } @@ -895,12 +1065,9 @@ zpool_open(libzfs_handle_t *hdl, const c void zpool_close(zpool_handle_t *zhp) { - if (zhp->zpool_config) - nvlist_free(zhp->zpool_config); - if (zhp->zpool_old_config) - nvlist_free(zhp->zpool_old_config); - if (zhp->zpool_props) - nvlist_free(zhp->zpool_props); + nvlist_free(zhp->zpool_config); + nvlist_free(zhp->zpool_old_config); + nvlist_free(zhp->zpool_props); free(zhp); } @@ -936,7 +1103,6 @@ zpool_create(libzfs_handle_t *hdl, const nvlist_t *zc_fsprops = NULL; nvlist_t *zc_props = NULL; char msg[1024]; - char *altroot; int ret = -1; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, @@ -949,8 +1115,10 @@ zpool_create(libzfs_handle_t *hdl, const return (-1); if (props) { + prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; + if ((zc_props = zpool_valid_proplist(hdl, pool, props, - SPA_VERSION_1, B_TRUE, msg)) == NULL) { + SPA_VERSION_1, flags, msg)) == NULL) { goto create_failed; } } @@ -963,8 +1131,8 @@ zpool_create(libzfs_handle_t *hdl, const zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && strcmp(zonestr, "on") == 0); - if ((zc_fsprops = zfs_valid_proplist(hdl, - ZFS_TYPE_FILESYSTEM, fsprops, zoned, NULL, msg)) == NULL) { + if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, + fsprops, zoned, NULL, NULL, msg)) == NULL) { goto create_failed; } if (!zc_props && @@ -1000,6 +1168,21 @@ zpool_create(libzfs_handle_t *hdl, const "one or more vdevs refer to the same device")); return (zfs_error(hdl, EZFS_BADDEV, msg)); + case ERANGE: + /* + * This happens if the record size is smaller or larger + * than the allowed size range, or not a power of 2. + * + * NOTE: although zfs_valid_proplist is called earlier, + * this case may have slipped through since the + * pool does not exist yet and it is therefore + * impossible to read properties e.g. max blocksize + * from the pool. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "record size invalid")); + return (zfs_error(hdl, EZFS_BADPROP, msg)); + case EOVERFLOW: /* * This occurs when one of the devices is below @@ -1033,21 +1216,6 @@ zpool_create(libzfs_handle_t *hdl, const } } - /* - * If this is an alternate root pool, then we automatically set the - * mountpoint of the root dataset to be '/'. 
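
Pool and filesystem properties handed to zpool_create() are now validated through zpool_valid_proplist()/zfs_valid_proplist() with the create flag set, as shown above. A caller-side sketch, assuming the vdev tree nvlist ('nvroot') has already been built elsewhere; the property choices are illustrative.

	#include <libzfs.h>
	#include <libnvpair.h>

	/* Sketch: create a pool with one pool property and one fs property. */
	static int
	create_pool(libzfs_handle_t *hdl, const char *name, nvlist_t *nvroot)
	{
		nvlist_t *props = NULL, *fsprops = NULL;
		int err = -1;

		if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0 ||
		    nvlist_alloc(&fsprops, NV_UNIQUE_NAME, 0) != 0)
			goto out;
		if (nvlist_add_string(props, "autoexpand", "on") != 0 ||
		    nvlist_add_string(fsprops, "compression", "on") != 0)
			goto out;

		err = zpool_create(hdl, name, nvroot, props, fsprops);
	out:
		nvlist_free(props);	/* nvlist_free(NULL) is a no-op */
		nvlist_free(fsprops);
		return (err);
	}
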
- */ - if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), - &altroot) == 0) { - zfs_handle_t *zhp; - - verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL); - verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), - "/") == 0); - - zfs_close(zhp); - } - create_failed: zcmd_free_nvlists(&zc); nvlist_free(zc_props); @@ -1060,7 +1228,7 @@ create_failed: * datasets left in the pool. */ int -zpool_destroy(zpool_handle_t *zhp) +zpool_destroy(zpool_handle_t *zhp, const char *log_str) { zfs_cmd_t zc = { 0 }; zfs_handle_t *zfp = NULL; @@ -1068,13 +1236,13 @@ zpool_destroy(zpool_handle_t *zhp) char msg[1024]; if (zhp->zpool_state == POOL_STATE_ACTIVE && - (zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, - ZFS_TYPE_FILESYSTEM)) == NULL) + (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_history = (uint64_t)(uintptr_t)log_str; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { + if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), zhp->zpool_name); @@ -1125,25 +1293,6 @@ zpool_add(zpool_handle_t *zhp, nvlist_t return (zfs_error(hdl, EZFS_BADVERSION, msg)); } - if (pool_is_bootable(zhp) && nvlist_lookup_nvlist_array(nvroot, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - uint64_t s; - - for (s = 0; s < nspares; s++) { - char *path; - - if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, - &path) == 0 && pool_uses_efi(spares[s])) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "device '%s' contains an EFI label and " - "cannot be used on root pools."), - zpool_vdev_name(hdl, NULL, spares[s], - B_FALSE)); - return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); - } - } - } - if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < SPA_VERSION_L2CACHE && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, @@ -1157,7 +1306,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { case EBUSY: /* @@ -1228,7 +1377,8 @@ zpool_add(zpool_handle_t *zhp, nvlist_t * mounted datasets in the pool. 
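
zpool_destroy() and zpool_export() both grow a log_str argument here, carried to the kernel in zc_history. A small sketch of updated callers; the history strings are illustrative.

	#include <libzfs.h>

	/*
	 * Sketch: retire a pool either by destroying or exporting it,
	 * recording the invoking command line in the pool history.
	 */
	static int
	retire_pool(zpool_handle_t *zhp, boolean_t destroy)
	{
		if (destroy)
			return (zpool_destroy(zhp, "zpool destroy (sketch)"));

		/* zpool_export_force() takes the same extra argument */
		return (zpool_export(zhp, B_FALSE, "zpool export (sketch)"));
	}
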
*/ static int -zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) +zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, + const char *log_str) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -1239,6 +1389,7 @@ zpool_export_common(zpool_handle_t *zhp, (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; zc.zc_guid = hardforce; + zc.zc_history = (uint64_t)(uintptr_t)log_str; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { @@ -1260,35 +1411,41 @@ zpool_export_common(zpool_handle_t *zhp, } int -zpool_export(zpool_handle_t *zhp, boolean_t force) +zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) { - return (zpool_export_common(zhp, force, B_FALSE)); + return (zpool_export_common(zhp, force, B_FALSE, log_str)); } int -zpool_export_force(zpool_handle_t *zhp) +zpool_export_force(zpool_handle_t *zhp, const char *log_str) { - return (zpool_export_common(zhp, B_TRUE, B_TRUE)); + return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); } static void zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, - nvlist_t *rbi) + nvlist_t *config) { + nvlist_t *nv = NULL; uint64_t rewindto; int64_t loss = -1; struct tm t; char timestr[128]; - if (!hdl->libzfs_printerr || rbi == NULL) + if (!hdl->libzfs_printerr || config == NULL) return; - if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { + return; + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) return; - (void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "", &t) != 0) { + strftime(timestr, 128, 0, &t) != 0) { if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " @@ -1301,16 +1458,15 @@ zpool_rewind_exclaim(libzfs_handle_t *hd } if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, - "%s approximately %jd "), + "%s approximately %lld "), dryrun ? "Would discard" : "Discarded", - ((uintmax_t)(loss + 30) / 60)); + (loss + 30) / 60); (void) printf(dgettext(TEXT_DOMAIN, "minutes of transactions.\n")); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, - "%s approximately %jd "), - dryrun ? "Would discard" : "Discarded", - (uintmax_t)loss); + "%s approximately %lld "), + dryrun ? 
"Would discard" : "Discarded", loss); (void) printf(dgettext(TEXT_DOMAIN, "seconds of transactions.\n")); } @@ -1321,6 +1477,7 @@ void zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, nvlist_t *config) { + nvlist_t *nv = NULL; int64_t loss = -1; uint64_t edata = UINT64_MAX; uint64_t rewindto; @@ -1336,19 +1493,20 @@ zpool_explain_recover(libzfs_handle_t *h (void) printf(dgettext(TEXT_DOMAIN, "\t")); /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ - if (nvlist_lookup_uint64(config, - ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) goto no_info; - (void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, &edata); (void) printf(dgettext(TEXT_DOMAIN, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "", &t) != 0) { + strftime(timestr, 128, 0, &t) != 0) { (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), @@ -1361,13 +1519,12 @@ zpool_explain_recover(libzfs_handle_t *h if (loss > 120) { (void) printf(dgettext(TEXT_DOMAIN, - "Approximately %jd minutes of data\n" - "\tmust be discarded, irreversibly. "), - (uintmax_t)((loss + 30) / 60)); + "Approximately %lld minutes of data\n" + "\tmust be discarded, irreversibly. "), (loss + 30) / 60); } else if (loss > 0) { (void) printf(dgettext(TEXT_DOMAIN, - "Approximately %jd seconds of data\n" - "\tmust be discarded, irreversibly. "), (uintmax_t)loss); + "Approximately %lld seconds of data\n" + "\tmust be discarded, irreversibly. "), loss); } if (edata != 0 && edata != UINT64_MAX) { if (edata == 1) { @@ -1426,12 +1583,63 @@ zpool_import(libzfs_handle_t *hdl, nvlis } } - ret = zpool_import_props(hdl, config, newname, props, B_FALSE); - if (props) - nvlist_free(props); + ret = zpool_import_props(hdl, config, newname, props, + ZFS_IMPORT_NORMAL); + nvlist_free(props); return (ret); } +static void +print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, + int indent) +{ + nvlist_t **child; + uint_t c, children; + char *vname; + uint64_t is_log = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, + &is_log); + + if (name != NULL) + (void) printf("\t%*s%s%s\n", indent, "", name, + is_log ? 
" [log]" : ""); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(hdl, NULL, child[c], B_TRUE); + print_vdev_tree(hdl, vname, child[c], indent + 2); + free(vname); + } +} + +void +zpool_print_unsup_feat(nvlist_t *config) +{ + nvlist_t *nvinfo, *unsup_feat; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == + 0); + verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT, + &unsup_feat) == 0); + + for (nvpair_t *nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(unsup_feat, nvp)) { + char *desc; + + verify(nvpair_type(nvp) == DATA_TYPE_STRING); + verify(nvpair_value_string(nvp, &desc) == 0); + + if (strlen(desc) > 0) + (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); + else + (void) printf("\t%s\n", nvpair_name(nvp)); + } +} + /* * Import the given pool using the known configuration and a list of * properties to be set. The configuration should have come from @@ -1440,15 +1648,17 @@ zpool_import(libzfs_handle_t *hdl, nvlis */ int zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, - nvlist_t *props, boolean_t importfaulted) + nvlist_t *props, int flags) { zfs_cmd_t zc = { 0 }; zpool_rewind_policy_t policy; - nvlist_t *nvi = NULL; + nvlist_t *nv = NULL; + nvlist_t *nvinfo = NULL; + nvlist_t *missing = NULL; char *thename; char *origname; - uint64_t returned_size; int ret; + int error = 0; char errbuf[1024]; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, @@ -1462,24 +1672,26 @@ zpool_import_props(libzfs_handle_t *hdl, return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, dgettext(TEXT_DOMAIN, "cannot import '%s'"), newname)); - thename = __UNCONST(newname); + thename = (char *)newname; } else { thename = origname; } - if (props) { + if (props != NULL) { uint64_t version; + prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) == 0); if ((props = zpool_valid_proplist(hdl, origname, - props, version, B_TRUE, errbuf)) == NULL) { + props, version, flags, errbuf)) == NULL) return (-1); - } else if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { + if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { nvlist_free(props); return (-1); } + nvlist_free(props); } (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); @@ -1488,30 +1700,42 @@ zpool_import_props(libzfs_handle_t *hdl, &zc.zc_guid) == 0); if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) { - nvlist_free(props); + zcmd_free_nvlists(&zc); return (-1); } - returned_size = zc.zc_nvlist_conf_size + 512; - if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) { - nvlist_free(props); + if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) { + zcmd_free_nvlists(&zc); return (-1); } - zc.zc_cookie = (uint64_t)importfaulted; - ret = 0; - if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) { + zc.zc_cookie = flags; + while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && + errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } + if (ret != 0) + error = errno; + + (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); + + zcmd_free_nvlists(&zc); + + zpool_get_rewind_policy(config, &policy); + + if (error) { char desc[1024]; - (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); - zpool_get_rewind_policy(config, &policy); /* * Dry-run failed, but we print out what success * looks like if we found a best 
txg */ - if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) { + if (policy.zrp_request & ZPOOL_TRY_REWIND) { zpool_rewind_exclaim(hdl, newname ? origname : thename, - B_TRUE, nvi); - nvlist_free(nvi); + B_TRUE, nv); + nvlist_free(nv); return (-1); } @@ -1524,8 +1748,24 @@ zpool_import_props(libzfs_handle_t *hdl, dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), origname, thename); - switch (errno) { + switch (error) { case ENOTSUP: + if (nv != NULL && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { + (void) printf(dgettext(TEXT_DOMAIN, "This " + "pool uses the following feature(s) not " + "supported by this system:\n")); + zpool_print_unsup_feat(nv); + if (nvlist_exists(nvinfo, + ZPOOL_CONFIG_CAN_RDONLY)) { + (void) printf(dgettext(TEXT_DOMAIN, + "All unsupported features are only " + "required for writing to the pool." + "\nThe pool can be imported using " + "'-o readonly=on'.\n")); + } + } /* * Unsupported version. */ @@ -1536,15 +1776,43 @@ zpool_import_props(libzfs_handle_t *hdl, (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); break; + case EROFS: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is read only")); + (void) zfs_error(hdl, EZFS_BADDEV, desc); + break; + + case ENXIO: + if (nv && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_lookup_nvlist(nvinfo, + ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "The devices below are missing, use " + "'-m' to import the pool anyway:\n")); + print_vdev_tree(hdl, NULL, missing, 2); + (void) printf("\n"); + } + (void) zpool_standard_error(hdl, error, desc); + break; + + case EEXIST: + (void) zpool_standard_error(hdl, error, desc); + break; + case ENAMETOOLONG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new name of at least one dataset is longer than " + "the maximum allowable length")); + (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc); + break; default: - (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); - (void) zpool_standard_error(hdl, errno, desc); + (void) zpool_standard_error(hdl, error, desc); zpool_explain_recover(hdl, - newname ? origname : thename, -errno, nvi); - nvlist_free(nvi); + newname ? origname : thename, -error, nv); break; } + nvlist_free(nv); ret = -1; } else { zpool_handle_t *zhp; @@ -1556,49 +1824,101 @@ zpool_import_props(libzfs_handle_t *hdl, ret = -1; else if (zhp != NULL) zpool_close(zhp); - (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); - zpool_get_rewind_policy(config, &policy); if (policy.zrp_request & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { zpool_rewind_exclaim(hdl, newname ? origname : thename, - ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), - nvi); + ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), nv); } - nvlist_free(nvi); + nvlist_free(nv); return (0); } - zcmd_free_nvlists(&zc); - nvlist_free(props); - return (ret); } /* - * Scrub the pool. + * Scan the pool. 
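
zpool_import_props() now takes a flags word rather than the old importfaulted boolean, and its error path can report unsupported features and missing devices as shown above. A minimal sketch of an updated caller; the config nvlist is assumed to come from the usual discovery path (zpool_find_import() and friends).

	#include <libzfs.h>

	/* Sketch: plain import, optionally under a new name, no properties. */
	static int
	import_pool(libzfs_handle_t *hdl, nvlist_t *config, const char *newname)
	{
		return (zpool_import_props(hdl, config, newname, NULL,
		    ZFS_IMPORT_NORMAL));
	}
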
*/ int -zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type) +zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func) { zfs_cmd_t zc = { 0 }; char msg[1024]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_cookie = type; + zc.zc_cookie = func; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SCRUB, &zc) == 0) + if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0 || + (errno == ENOENT && func != POOL_SCAN_NONE)) return (0); - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); + if (func == POOL_SCAN_SCRUB) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot scrub %s"), zc.zc_name); + } else if (func == POOL_SCAN_NONE) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), + zc.zc_name); + } else { + assert(!"unexpected result"); + } - if (errno == EBUSY) - return (zfs_error(hdl, EZFS_RESILVERING, msg)); - else + if (errno == EBUSY) { + nvlist_t *nvroot; + pool_scan_stat_t *ps = NULL; + uint_t psc; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); + if (ps && ps->pss_func == POOL_SCAN_SCRUB) + return (zfs_error(hdl, EZFS_SCRUBBING, msg)); + else + return (zfs_error(hdl, EZFS_RESILVERING, msg)); + } else if (errno == ENOENT) { + return (zfs_error(hdl, EZFS_NO_SCRUB, msg)); + } else { return (zpool_standard_error(hdl, errno, msg)); + } } +#ifdef illumos +/* + * This provides a very minimal check whether a given string is likely a + * c#t#d# style string. Users of this are expected to do their own + * verification of the s# part. + */ +#define CTD_CHECK(str) (str && str[0] == 'c' && isdigit(str[1])) + +/* + * More elaborate version for ones which may start with "/dev/dsk/" + * and the like. + */ +static int +ctd_check_path(char *str) +{ + /* + * If it starts with a slash, check the last component. + */ + if (str && str[0] == '/') { + char *tmp = strrchr(str, '/'); + + /* + * If it ends in "/old", check the second-to-last + * component of the string instead. + */ + if (tmp != str && strcmp(tmp, "/old") == 0) { + for (tmp--; *tmp != '/'; tmp--) + ; + } + str = tmp + 1; + } + return (CTD_CHECK(str)); +} +#endif + /* * Find a vdev that matches the search criteria specified. We use the * the nvpair name to determine how we should look for the device. @@ -1624,26 +1944,17 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist srchkey = nvpair_name(pair); switch (nvpair_type(pair)) { - case DATA_TYPE_UINT64: { - uint64_t srchval, theguid, present; - - verify(nvpair_value_uint64(pair, &srchval) == 0); + case DATA_TYPE_UINT64: if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &present) == 0) { - /* - * If the device has never been present since - * import, the only reliable way to match the - * vdev is by GUID. - */ - verify(nvlist_lookup_uint64(nv, - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == srchval) - return (nv); - } + uint64_t srchval, theguid; + + verify(nvpair_value_uint64(pair, &srchval) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, + &theguid) == 0); + if (theguid == srchval) + return (nv); } break; - } case DATA_TYPE_STRING: { char *srchval, *val; @@ -1653,27 +1964,58 @@ vdev_to_nvlist_iter(nvlist_t *nv, nvlist break; /* - * Search for the requested value. 
We special case the search - * for ZPOOL_CONFIG_PATH when it's a wholedisk and when - * Looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). + * Search for the requested value. Special cases: + * + * - ZPOOL_CONFIG_PATH for whole disk entries. These end in + * "s0" or "s0/old". The "s0" part is hidden from the user, + * but included in the string, so this matches around it. + * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). + * * Otherwise, all other searches are simple string compares. */ - if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) { +#ifdef illumos + if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && + ctd_check_path(val)) { uint64_t wholedisk = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); if (wholedisk) { + int slen = strlen(srchval); + int vlen = strlen(val); + + if (slen != vlen - 2) + break; + /* - * For whole disks, the internal path has 's0', - * but the path passed in by the user doesn't. + * make_leaf_vdev() should only set + * wholedisk for ZPOOL_CONFIG_PATHs which + * will include "/dev/dsk/", giving plenty of + * room for the indices used next. */ - if (strlen(srchval) == strlen(val) - 2 && - strncmp(srchval, val, strlen(srchval)) == 0) + ASSERT(vlen >= 6); + + /* + * strings identical except trailing "s0" + */ + if (strcmp(&val[vlen - 2], "s0") == 0 && + strncmp(srchval, val, slen) == 0) return (nv); + + /* + * strings identical except trailing "s0/old" + */ + if (strcmp(&val[vlen - 6], "s0/old") == 0 && + strcmp(&srchval[slen - 4], "/old") == 0 && + strncmp(srchval, val, slen - 4) == 0) + return (nv); + break; } } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { +#else + if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { +#endif char *type, *idx, *end, *p; uint64_t id, vdev_id; @@ -1798,6 +2140,9 @@ zpool_find_vdev_by_physpath(zpool_handle &nvroot) == 0); *avail_spare = B_FALSE; + *l2cache = B_FALSE; + if (log != NULL) + *log = B_FALSE; ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); nvlist_free(search); @@ -1807,7 +2152,7 @@ zpool_find_vdev_by_physpath(zpool_handle /* * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). */ -static boolean_t +boolean_t zpool_vdev_is_interior(const char *name) { if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || @@ -1833,7 +2178,7 @@ zpool_find_vdev(zpool_handle_t *zhp, con } else if (zpool_vdev_is_interior(path)) { verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0); } else if (path[0] != '/') { - (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path); + (void) snprintf(buf, sizeof (buf), "%s%s", _PATH_DEV, path); verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); } else { verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); @@ -1931,17 +2276,17 @@ vdev_get_physpaths(nvlist_t *nv, char *p (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { nvlist_t **child; uint_t count; - int i, rv; + int i, ret; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) return (EZFS_INVALCONFIG); for (i = 0; i < count; i++) { - rv = vdev_get_physpaths(child[i], physpath, + ret = vdev_get_physpaths(child[i], physpath, phypath_size, rsz, is_spare); - if (rv == EZFS_NOSPC) - return (rv); + if (ret == EZFS_NOSPC) + return (ret); } } @@ -1973,11 +2318,9 @@ zpool_get_config_physpath(nvlist_t *conf return (EZFS_INVALCONFIG); /* - * root pool can not have EFI labeled disks and can only have - * a single top-level vdev. + * root pool can only have a single top-level vdev. 
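
zpool_vdev_is_interior() is made public and short vdev names are now resolved against _PATH_DEV instead of /dev/dsk, so lookups work as sketched below. The device name is an illustrative assumption.

	#include <stdio.h>
	#include <libzfs.h>

	/* Sketch: look up a leaf vdev by name on an open pool handle. */
	static void
	find_vdev_example(zpool_handle_t *zhp)
	{
		boolean_t spare, l2cache, log;
		nvlist_t *tgt;

		tgt = zpool_find_vdev(zhp, "wd0a", &spare, &l2cache, &log);
		if (tgt == NULL)
			(void) printf("no such vdev in this pool\n");
		else if (spare || l2cache)
			(void) printf("vdev is a spare or cache device\n");
		else
			(void) printf("vdev found%s\n",
			    log ? " (log device)" : "");
	}
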
*/ - if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 || - pool_uses_efi(vdev_root)) + if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1) return (EZFS_POOL_INVALARG); (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, @@ -2008,6 +2351,7 @@ zpool_get_physpath(zpool_handle_t *zhp, static int zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) { +#ifdef illumos char path[MAXPATHLEN]; char errbuf[1024]; int fd, error; @@ -2017,7 +2361,7 @@ zpool_relabel_disk(libzfs_handle_t *hdl, "efi_use_whole_disk")) == NULL) return (-1); - (void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name); + (void) snprintf(path, sizeof (path), "%s/%s", ZFS_RDISK_ROOT, name); if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " @@ -2037,6 +2381,7 @@ zpool_relabel_disk(libzfs_handle_t *hdl, "relabel '%s': unable to read disk capacity"), name); return (zfs_error(hdl, EZFS_NOCAP, errbuf)); } +#endif /* illumos */ return (0); } @@ -2092,15 +2437,15 @@ zpool_vdev_online(zpool_handle_t *zhp, c } if (wholedisk) { - pathname += strlen(DISK_ROOT) + 1; - (void) zpool_relabel_disk(zhp->zpool_hdl, pathname); + pathname += strlen(ZFS_DISK_ROOT) + 1; + (void) zpool_relabel_disk(hdl, pathname); } } zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { if (errno == EINVAL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " "from this pool into a new one. Use '%s' " @@ -2142,7 +2487,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, zc.zc_cookie = VDEV_STATE_OFFLINE; zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { @@ -2175,14 +2520,14 @@ zpool_vdev_fault(zpool_handle_t *zhp, ui libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot fault %ju"), (uintmax_t)guid); + dgettext(TEXT_DOMAIN, "cannot fault %llu"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; zc.zc_obj = aux; - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); switch (errno) { @@ -2210,14 +2555,14 @@ zpool_vdev_degrade(zpool_handle_t *zhp, libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot degrade %ju"), (uintmax_t)guid); + dgettext(TEXT_DOMAIN, "cannot degrade %llu"), guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_DEGRADED; zc.zc_obj = aux; - if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + if (ioctl(hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); return (zpool_standard_error(hdl, errno, msg)); @@ -2265,12 +2610,12 @@ zpool_vdev_attach(zpool_handle_t *zhp, nvlist_t *tgt; boolean_t avail_spare, l2cache, islog; uint64_t val; - char *path, *newname; + char *newname; nvlist_t **child; uint_t children; nvlist_t *config_root; libzfs_handle_t *hdl = zhp->zpool_hdl; - boolean_t rootpool = pool_is_bootable(zhp); + boolean_t rootpool = zpool_is_bootable(zhp); if (replacing) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, @@ -2279,16 +2624,6 @@ zpool_vdev_attach(zpool_handle_t *zhp, (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, 
"cannot attach %s to %s"), new_disk, old_disk); - /* - * If this is a root pool, make sure that we're not attaching an - * EFI labeled device. - */ - if (rootpool && pool_uses_efi(nvroot)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "EFI labeled devices are not supported on root pools.")); - return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); - } - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, &islog)) == 0) @@ -2331,48 +2666,34 @@ zpool_vdev_attach(zpool_handle_t *zhp, return (zfs_error(hdl, EZFS_BADTARGET, msg)); } - /* - * If we are attempting to replace a spare, it canot be applied to an - * already spared device. - */ - if (replacing && - nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 && - zpool_find_vdev(zhp, newname, &avail_spare, - &l2cache, NULL) != NULL && avail_spare && - is_replacing_spare(config_root, tgt, 0)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "device has already been replaced with a spare")); - free(newname); - return (zfs_error(hdl, EZFS_BADTARGET, msg)); - } - free(newname); if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); - ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_ATTACH, &zc); + ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); zcmd_free_nvlists(&zc); if (ret == 0) { if (rootpool) { /* - * XXX - This should be removed once we can - * automatically install the bootblocks on the - * newly attached disk. - */ - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please " - "be sure to invoke %s to make '%s' bootable.\n"), - BOOTCMD, new_disk); - - /* * XXX need a better way to prevent user from * booting up a half-baked vdev. */ (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " "sure to wait until resilver is done " "before rebooting.\n")); + (void) fprintf(stderr, "\n"); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "If " + "you boot from pool '%s', you may need to update\n" + "boot code on newly attached disk '%s'.\n\n" + "Assuming you use GPT partitioning and 'da0' is " + "your new boot disk\n" + "you may use the following command:\n\n" + "\tgpart bootcode -b /boot/pmbr -p " + "/boot/gptzfsboot -i 1 da0\n\n"), + zhp->zpool_name, new_disk); } return (0); } @@ -2383,9 +2704,16 @@ zpool_vdev_attach(zpool_handle_t *zhp, * Can't attach to or replace this type of vdev. 
*/ if (replacing) { + uint64_t version = zpool_get_prop_int(zhp, + ZPOOL_PROP_VERSION, NULL); + if (islog) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a log with a spare")); + else if (version >= SPA_VERSION_MULTI_REPLACE) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "already in replacing/spare config; wait " + "for completion or use 'zpool detach'")); else zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot replace a replacing device")); @@ -2483,7 +2811,7 @@ zpool_vdev_detach(zpool_handle_t *zhp, c */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " "applicable to mirror and replacing vdevs")); - (void) zfs_error(zhp->zpool_hdl, EZFS_BADTARGET, msg); + (void) zfs_error(hdl, EZFS_BADTARGET, msg); break; case EBUSY: @@ -2575,8 +2903,9 @@ zpool_vdev_split(zpool_handle_t *zhp, ch verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); if (props) { + prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, - props, vers, B_TRUE, msg)) == NULL) + props, vers, flags, msg)) == NULL) return (-1); } @@ -2584,8 +2913,7 @@ zpool_vdev_split(zpool_handle_t *zhp, ch &children) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool is missing vdev tree")); - if (zc_props) - nvlist_free(zc_props); + nvlist_free(zc_props); return (-1); } @@ -2733,10 +3061,8 @@ out: free(varray); } zcmd_free_nvlists(&zc); - if (zc_props) - nvlist_free(zc_props); - if (newconfig) - nvlist_free(newconfig); + nvlist_free(zc_props); + nvlist_free(newconfig); if (freelist) { nvlist_free(*newroot); *newroot = NULL; @@ -2810,6 +3136,7 @@ zpool_clear(zpool_handle_t *zhp, const c boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; nvlist_t *nvi = NULL; + int error; if (path) (void) snprintf(msg, sizeof (msg), @@ -2840,14 +3167,21 @@ zpool_clear(zpool_handle_t *zhp, const c zpool_get_rewind_policy(rewindnvl, &policy); zc.zc_cookie = policy.zrp_request; - if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0) + if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0) return (-1); - if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0) + if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0) return (-1); - if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 || - ((policy.zrp_request & ZPOOL_TRY_REWIND) && + while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && + errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } + + if (!error || ((policy.zrp_request & ZPOOL_TRY_REWIND) && errno != EPERM && errno != EACCES)) { if (policy.zrp_request & (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { @@ -2876,11 +3210,12 @@ zpool_vdev_clear(zpool_handle_t *zhp, ui libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot clear errors for %x"), - (uintmax_t)guid); + dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), + guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; + zc.zc_cookie = ZPOOL_NO_REWIND; if (ioctl(hdl->libzfs_fd, ZFS_IOC_CLEAR, &zc) == 0) return (0); @@ -2889,6 +3224,46 @@ zpool_vdev_clear(zpool_handle_t *zhp, ui } /* + * Change the GUID for a pool. 
+ */ +int +zpool_reguid(zpool_handle_t *zhp) +{ + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + zfs_cmd_t zc = { 0 }; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); +} + +/* + * Reopen the pool. + */ +int +zpool_reopen(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), + zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (zfs_ioctl(hdl, ZFS_IOC_POOL_REOPEN, &zc) == 0) + return (0); + return (zpool_standard_error(hdl, errno, msg)); +} + +/* * Convert from a devid string to a path. */ static char * @@ -2911,8 +3286,10 @@ devid_to_path(char *devid_str) if (ret != 0) return (NULL); - if ((path = strdup(list[0].devname)) == NULL) - return (NULL); + /* + * In a case the strdup() fails, we will just return NULL below. + */ + path = strdup(list[0].devname); devid_free_nmlist(list); @@ -2925,6 +3302,7 @@ devid_to_path(char *devid_str) static char * path_to_devid(const char *path) { +#ifdef have_devid int fd; ddi_devid_t devid; char *minor, *ret; @@ -2944,6 +3322,9 @@ path_to_devid(const char *path) (void) close(fd); return (ret); +#else + return (NULL); +#endif } /* @@ -2988,15 +3369,25 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp char buf[64]; vdev_stat_t *vs; uint_t vsc; + int have_stats; + int have_path; + + have_stats = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0; + have_path = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0; - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &value) == 0) { + /* + * If the device is not currently present, assume it will not + * come back at the same device path. Display the device by GUID. + */ + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 || + have_path && have_stats && vs->vs_state <= VDEV_STATE_CANT_OPEN) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) == 0); - (void) snprintf(buf, sizeof (buf), "%ju", - (uintmax_t)value); + (void) snprintf(buf, sizeof (buf), "%llu", + (u_longlong_t)value); path = buf; - } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { + } else if (have_path) { /* * If the device is dead (faulted, offline, etc) then don't @@ -3004,8 +3395,7 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp * open a misbehaving device, which can have undesirable * effects. */ - if ((nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &vsc) != 0 || + if ((have_stats == 0 || vs->vs_state >= VDEV_STATE_DEGRADED) && zhp != NULL && nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &devid) == 0) { @@ -3036,17 +3426,35 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp devid_str_free(newdevid); } - if (strncmp(path, "/dev/dsk/", 9) == 0) - path += 9; +#ifdef illumos + if (strncmp(path, ZFS_DISK_ROOTD, strlen(ZFS_DISK_ROOTD)) == 0) + path += strlen(ZFS_DISK_ROOTD); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value) { + int pathlen = strlen(path); char *tmp = zfs_strdup(hdl, path); - if (tmp == NULL) - return (NULL); - tmp[strlen(path) - 2] = '\0'; + + /* + * If it starts with c#, and ends with "s0", chop + * the "s0" off, or if it ends with "s0/old", remove + * the "s0" from the middle. 
+ */ + if (CTD_CHECK(tmp)) { + if (strcmp(&tmp[pathlen - 2], "s0") == 0) { + tmp[pathlen - 2] = '\0'; + } else if (pathlen > 6 && + strcmp(&tmp[pathlen - 6], "s0/old") == 0) { + (void) strcpy(&tmp[pathlen - 6], + "/old"); + } + } return (tmp); } +#else /* !illumos */ + if (strncmp(path, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0) + path += sizeof(_PATH_DEV) - 1; +#endif /* illumos */ } else { verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &path) == 0); @@ -3056,8 +3464,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &value) == 0); - (void) snprintf(buf, sizeof (buf), "%s%ju", path, - (uintmax_t)value); + (void) snprintf(buf, sizeof (buf), "%s%llu", path, + (u_longlong_t)value); path = buf; } @@ -3070,8 +3478,8 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &id) == 0); - (void) snprintf(buf, sizeof (buf), "%s-%ju", path, - (uintmax_t)id); + (void) snprintf(buf, sizeof (buf), "%s-%llu", path, + (u_longlong_t)id); path = buf; } } @@ -3080,9 +3488,9 @@ zpool_vdev_name(libzfs_handle_t *hdl, zp } static int -zbookmark_compare(const void *a, const void *b) +zbookmark_mem_compare(const void *a, const void *b) { - return (memcmp(a, b, sizeof (zbookmark_t))); + return (memcmp(a, b, sizeof (zbookmark_phys_t))); } /* @@ -3094,7 +3502,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nv { zfs_cmd_t zc = { 0 }; uint64_t count; - zbookmark_t *zb = NULL; + zbookmark_phys_t *zb = NULL; int i; /* @@ -3107,7 +3515,7 @@ zpool_get_errlog(zpool_handle_t *zhp, nv if (count == 0) return (0); if ((zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl, - count * sizeof (zbookmark_t))) == (uintptr_t)NULL) + count * sizeof (zbookmark_phys_t))) == (uintptr_t)NULL) return (-1); zc.zc_nvlist_dst_size = count; (void) strcpy(zc.zc_name, zhp->zpool_name); @@ -3116,11 +3524,14 @@ zpool_get_errlog(zpool_handle_t *zhp, nv &zc) != 0) { free((void *)(uintptr_t)zc.zc_nvlist_dst); if (errno == ENOMEM) { + void *dst; + count = zc.zc_nvlist_dst_size; - if ((zc.zc_nvlist_dst = (uintptr_t) - zfs_alloc(zhp->zpool_hdl, count * - sizeof (zbookmark_t))) == (uintptr_t)NULL) + dst = zfs_alloc(zhp->zpool_hdl, count * + sizeof (zbookmark_phys_t)); + if (dst == NULL) return (-1); + zc.zc_nvlist_dst = (uintptr_t)dst; } else { return (-1); } @@ -3136,11 +3547,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nv * _not_ copied as part of the process. So we point the start of our * array appropriate and decrement the total number of elements. 
*/ - zb = ((zbookmark_t *)(uintptr_t)zc.zc_nvlist_dst) + + zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) + zc.zc_nvlist_dst_size; count -= zc.zc_nvlist_dst_size; - qsort(zb, count, sizeof (zbookmark_t), zbookmark_compare); + qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare); verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0); @@ -3202,40 +3613,30 @@ zpool_upgrade(zpool_handle_t *zhp, uint6 } void -zpool_set_history_str(const char *subcommand, int argc, char **argv, - char *history_str) +zfs_save_arguments(int argc, char **argv, char *string, int len) { - int i; - - (void) strlcpy(history_str, subcommand, HIS_MAX_RECORD_LEN); - for (i = 1; i < argc; i++) { - if (strlen(history_str) + 1 + strlen(argv[i]) > - HIS_MAX_RECORD_LEN) - break; - (void) strlcat(history_str, " ", HIS_MAX_RECORD_LEN); - (void) strlcat(history_str, argv[i], HIS_MAX_RECORD_LEN); + (void) strlcpy(string, basename(argv[0]), len); + for (int i = 1; i < argc; i++) { + (void) strlcat(string, " ", len); + (void) strlcat(string, argv[i], len); } } -/* - * Stage command history for logging. - */ int -zpool_stage_history(libzfs_handle_t *hdl, const char *history_str) +zpool_log_history(libzfs_handle_t *hdl, const char *message) { - if (history_str == NULL) - return (EINVAL); - - if (strlen(history_str) > HIS_MAX_RECORD_LEN) - return (EINVAL); - - if (hdl->libzfs_log_str != NULL) - free(hdl->libzfs_log_str); - - if ((hdl->libzfs_log_str = strdup(history_str)) == NULL) - return (no_memory(hdl)); + zfs_cmd_t zc = { 0 }; + nvlist_t *args; + int err; - return (0); + args = fnvlist_alloc(); + fnvlist_add_string(args, "message", message); + err = zcmd_write_src_nvlist(hdl, &zc, args); + if (err == 0) + err = ioctl(hdl->libzfs_fd, ZFS_IOC_LOG_HISTORY, &zc); + nvlist_free(args); + zcmd_free_nvlists(&zc); + return (err); } /* @@ -3328,7 +3729,9 @@ zpool_history_unpack(char *buf, uint64_t return (0); } -#define HIS_BUF_LEN (128*1024) +/* from spa_history.c: spa_history_create_obj() */ +#define HIS_BUF_LEN_DEF (128 << 10) +#define HIS_BUF_LEN_MAX (1 << 30) /* * Retrieve the command history of a pool. @@ -3336,31 +3739,51 @@ zpool_history_unpack(char *buf, uint64_t int zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp) { - char buf[HIS_BUF_LEN]; + char *buf; + uint64_t buflen = HIS_BUF_LEN_DEF; uint64_t off = 0; nvlist_t **records = NULL; uint_t numrecords = 0; int err, i; + buf = malloc(buflen); + if (buf == NULL) + return (ENOMEM); do { - uint64_t bytes_read = sizeof (buf); + uint64_t bytes_read = buflen; uint64_t leftover; if ((err = get_history(zhp, buf, &off, &bytes_read)) != 0) break; /* if nothing else was read in, we're at EOF, just return */ - if (!bytes_read) + if (bytes_read == 0) break; if ((err = zpool_history_unpack(buf, bytes_read, &leftover, &records, &numrecords)) != 0) break; off -= leftover; + if (leftover == bytes_read) { + /* + * no progress made, because buffer is not big enough + * to hold this record; resize and retry. 
+ */ + buflen *= 2; + free(buf); + buf = NULL; + if ((buflen >= HIS_BUF_LEN_MAX) || + ((buf = malloc(buflen)) == NULL)) { + err = ENOMEM; + break; + } + } /* CONSTCOND */ } while (1); + free(buf); + if (!err) { verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD, @@ -3380,12 +3803,11 @@ zpool_obj_to_path(zpool_handle_t *zhp, u zfs_cmd_t zc = { 0 }; boolean_t mounted = B_FALSE; char *mntpnt = NULL; - char dsname[MAXNAMELEN]; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; if (dsobj == 0) { /* special case for the MOS */ - (void) snprintf(pathname, len, ":<%#jx>", - (uintmax_t)obj); + (void) snprintf(pathname, len, ":<0x%llx>", obj); return; } @@ -3395,8 +3817,8 @@ zpool_obj_to_path(zpool_handle_t *zhp, u if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { /* just write out a path of two object numbers */ - (void) snprintf(pathname, len, "<%#jx>:<%#jx>", - (uintmax_t)dsobj, (uintmax_t)obj); + (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", + dsobj, obj); return; } (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); @@ -3417,12 +3839,12 @@ zpool_obj_to_path(zpool_handle_t *zhp, u dsname, zc.zc_value); } } else { - (void) snprintf(pathname, len, "%s:<%#jx>", dsname, - (uintmax_t)obj); + (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, obj); } free(mntpnt); } +#ifdef illumos /* * Read the EFI label from the config, if a label does not exist then * pass back the error to the caller. If the caller has passed a non-NULL @@ -3440,7 +3862,7 @@ read_efi_label(nvlist_t *config, diskadd if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) return (err); - (void) snprintf(diskname, sizeof (diskname), "%s%s", RDISK_ROOT, + (void) snprintf(diskname, sizeof (diskname), "%s%s", ZFS_RDISK_ROOT, strrchr(path, '/')); if ((fd = open(diskname, O_RDONLY|O_NDELAY)) >= 0) { struct dk_gpt *vtoc; @@ -3487,14 +3909,16 @@ find_start_block(nvlist_t *config) } return (MAXOFFSET_T); } +#endif /* illumos */ /* * Label an individual disk. The name provided is the short name, * stripped of any leading /dev path. 
*/ int -zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) +zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) { +#ifdef illumos char path[MAXPATHLEN]; struct dk_gpt *vtoc; int fd; @@ -3510,13 +3934,6 @@ zpool_label_disk(libzfs_handle_t *hdl, z if (zhp) { nvlist_t *nvroot; - if (pool_is_bootable(zhp)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "EFI labeled devices are not supported on root " - "pools.")); - return (zfs_error(hdl, EZFS_POOL_NOTSUP, errbuf)); - } - verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); @@ -3530,7 +3947,7 @@ zpool_label_disk(libzfs_handle_t *hdl, z start_block = NEW_START_BLOCK; } - (void) snprintf(path, sizeof (path), "%s/%s%s", RDISK_ROOT, name, + (void) snprintf(path, sizeof (path), "%s/%s%s", ZFS_RDISK_ROOT, name, BACKUP_SLICE); if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { @@ -3599,6 +4016,7 @@ zpool_label_disk(libzfs_handle_t *hdl, z (void) close(fd); efi_free(vtoc); +#endif /* illumos */ return (0); } @@ -3610,9 +4028,7 @@ supported_dump_vdev_type(libzfs_handle_t uint_t children, c; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || - strcmp(type, VDEV_TYPE_FILE) == 0 || - strcmp(type, VDEV_TYPE_LOG) == 0 || + if (strcmp(type, VDEV_TYPE_FILE) == 0 || strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_MISSING) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -3631,8 +4047,12 @@ supported_dump_vdev_type(libzfs_handle_t } /* - * check if this zvol is allowable for use as a dump device; zero if - * it is, > 0 if it isn't, < 0 if it isn't a zvol + * Check if this zvol is allowable for use as a dump device; zero if + * it is, > 0 if it isn't, < 0 if it isn't a zvol. + * + * Allowable storage configurations include mirrors, all raidz variants, and + * pools with log, cache, and spare devices. Pools which are backed by files or + * have missing/hole vdevs are not suitable. 
*/ int zvol_check_dump_config(char *arg) @@ -3644,7 +4064,7 @@ zvol_check_dump_config(char *arg) uint_t toplevels; libzfs_handle_t *hdl; char errbuf[1024]; - char poolname[ZPOOL_MAXNAMELEN]; + char poolname[ZFS_MAX_DATASET_NAME_LEN]; int pathlen = strlen(ZVOL_FULL_DEV_DIR); int ret = 1; @@ -3667,7 +4087,7 @@ zvol_check_dump_config(char *arg) "malformed dataset name")); (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); return (1); - } else if (p - volname >= ZFS_MAXNAMELEN) { + } else if (p - volname >= ZFS_MAX_DATASET_NAME_LEN) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "dataset name is too long")); (void) zfs_error(hdl, EZFS_NAMETOOLONG, errbuf); @@ -3694,12 +4114,6 @@ zvol_check_dump_config(char *arg) verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &top, &toplevels) == 0); - if (toplevels != 1) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "'%s' has multiple top level vdevs"), poolname); - (void) zfs_error(hdl, EZFS_DEVOVERFLOW, errbuf); - goto out; - } if (!supported_dump_vdev_type(hdl, top[0], errbuf)) { goto out; @@ -3712,3 +4126,25 @@ out: libzfs_fini(hdl); return (ret); } + +int +zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid, + const char *command) +{ + zfs_cmd_t zc = { 0 }; + nvlist_t *args; + char *packed; + size_t size; + int error; + + args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid); + fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid); + fnvlist_add_string(args, "command", command); + error = zcmd_write_src_nvlist(hdl, &zc, args); + if (error == 0) + error = ioctl(hdl->libzfs_fd, ZFS_IOC_NEXTBOOT, &zc); + zcmd_free_nvlists(&zc); + nvlist_free(args); + return (error); +} Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c,v retrieving revision 1.2 diff -u -p -r1.2 libzfs_sendrecv.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c 3 Jun 2012 10:53:51 -0000 1.2 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c 22 Apr 2017 17:03:30 -0000 @@ -20,8 +20,14 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Igor Kozhukhov */ #include @@ -34,25 +40,38 @@ #include #include #include +#include #include #include #include +#include #include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_fletcher.h" #include "libzfs_impl.h" +#include #include #include #include +#ifdef __FreeBSD__ +extern int zfs_ioctl_version; +/* We need to use something for ENODATA. 
*/ +#define ENODATA EIDRM +#endif + /* in libzfs_dataset.c */ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); -static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t, - int, avl_tree_t *, char **); +static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, + recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int, + uint64_t *, const char *); +static int guid_to_name(libzfs_handle_t *, const char *, + uint64_t, boolean_t, char *); static const zio_cksum_t zero_cksum = { 0 }; @@ -62,6 +81,12 @@ typedef struct dedup_arg { libzfs_handle_t *dedup_hdl; } dedup_arg_t; +typedef struct progress_arg { + zfs_handle_t *pa_zhp; + int pa_fd; + boolean_t pa_parsable; +} progress_arg_t; + typedef struct dataref { uint64_t ref_guid; uint64_t ref_object; @@ -169,10 +194,28 @@ ddt_update(libzfs_handle_t *hdl, dedup_t } static int -cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd) +dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, + zio_cksum_t *zc, int outfd) { - fletcher_4_incremental_native(buf, len, zc); - return (write(outfd, buf, len)); + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); + if (drr->drr_type != DRR_BEGIN) { + ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. + drr_checksum.drr_checksum)); + drr->drr_u.drr_checksum.drr_checksum = *zc; + } + fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), zc); + if (write(outfd, drr, sizeof (*drr)) == -1) + return (errno); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, zc); + if (write(outfd, payload, payload_len) == -1) + return (errno); + } + return (0); } /* @@ -196,27 +239,21 @@ static void * cksummer(void *arg) { dedup_arg_t *dda = arg; - char *buf = malloc(1<<20); + char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE); dmu_replay_record_t thedrr; dmu_replay_record_t *drr = &thedrr; - struct drr_begin *drrb = &thedrr.drr_u.drr_begin; - struct drr_end *drre = &thedrr.drr_u.drr_end; - struct drr_object *drro = &thedrr.drr_u.drr_object; - struct drr_write *drrw = &thedrr.drr_u.drr_write; FILE *ofp; int outfd; - dmu_replay_record_t wbr_drr = {0}; - struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref; dedup_table_t ddt; zio_cksum_t stream_cksum; uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); uint64_t numbuckets; ddt.max_ddt_size = - MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100, - SMALLEST_POSSIBLE_MAX_DDT_MB<<20); + MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100, + SMALLEST_POSSIBLE_MAX_DDT_MB << 20); - numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t)); + numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t)); /* * numbuckets must be a power of 2. Increase number to @@ -232,86 +269,90 @@ cksummer(void *arg) ddt.numhashbits = high_order_bit(numbuckets) - 1; ddt.ddt_full = B_FALSE; - /* Initialize the write-by-reference block. 
*/ - wbr_drr.drr_type = DRR_WRITE_BYREF; - wbr_drr.drr_payloadlen = 0; - outfd = dda->outputfd; ofp = fdopen(dda->inputfd, "r"); - while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) { + while (ssread(drr, sizeof (*drr), ofp) != 0) { switch (drr->drr_type) { case DRR_BEGIN: { - int fflags; + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int fflags; + int sz = 0; ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + /* set the DEDUP feature flag for this stream */ fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); fflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) - goto out; - if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == - DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) { - int sz = drr->drr_payloadlen; - - if (sz > 1<<20) { - free(buf); - buf = malloc(sz); + if (drr->drr_payloadlen != 0) { + sz = drr->drr_payloadlen; + + if (sz > SPA_MAXBLOCKSIZE) { + buf = zfs_realloc(dda->dedup_hdl, buf, + SPA_MAXBLOCKSIZE, sz); } (void) ssread(buf, sz, ofp); if (ferror(stdin)) perror("fread"); - if (cksum_and_write(buf, sz, &stream_cksum, - outfd) == -1) - goto out; } + if (dump_record(drr, buf, sz, &stream_cksum, + outfd) != 0) + goto out; break; } case DRR_END: { + struct drr_end *drre = &drr->drr_u.drr_end; /* use the recalculated checksum */ - ZIO_SET_CHECKSUM(&drre->drr_checksum, - stream_cksum.zc_word[0], stream_cksum.zc_word[1], - stream_cksum.zc_word[2], stream_cksum.zc_word[3]); - if ((write(outfd, drr, - sizeof (dmu_replay_record_t))) == -1) + drre->drr_checksum = stream_cksum; + if (dump_record(drr, NULL, 0, &stream_cksum, + outfd) != 0) goto out; break; } case DRR_OBJECT: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) - goto out; + struct drr_object *drro = &drr->drr_u.drr_object; if (drro->drr_bonuslen > 0) { (void) ssread(buf, P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), ofp); - if (cksum_and_write(buf, - P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), - &stream_cksum, outfd) == -1) - goto out; } + if (dump_record(drr, buf, + P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), + &stream_cksum, outfd) != 0) + goto out; + break; + } + + case DRR_SPILL: + { + struct drr_spill *drrs = &drr->drr_u.drr_spill; + (void) ssread(buf, drrs->drr_length, ofp); + if (dump_record(drr, buf, drrs->drr_length, + &stream_cksum, outfd) != 0) + goto out; break; } case DRR_FREEOBJECTS: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) + if (dump_record(drr, NULL, 0, &stream_cksum, + outfd) != 0) goto out; break; } case DRR_WRITE: { + struct drr_write *drrw = &drr->drr_u.drr_write; dataref_t dataref; (void) ssread(buf, drrw->drr_length, ofp); @@ -349,7 +390,13 @@ cksummer(void *arg) if (ddt_update(dda->dedup_hdl, &ddt, &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, &dataref)) { + dmu_replay_record_t wbr_drr = {0}; + struct drr_write_byref *wbr_drrr = + &wbr_drr.drr_u.drr_write_byref; + /* block already present in stream */ + wbr_drr.drr_type = DRR_WRITE_BYREF; + wbr_drrr->drr_object = drrw->drr_object; wbr_drrr->drr_offset = drrw->drr_offset; wbr_drrr->drr_length = drrw->drr_length; @@ -369,34 +416,41 @@ cksummer(void *arg) wbr_drrr->drr_key.ddk_prop = drrw->drr_key.ddk_prop; - if (cksum_and_write(&wbr_drr, - sizeof (dmu_replay_record_t), &stream_cksum, - outfd) == -1) + if (dump_record(&wbr_drr, NULL, 0, + 
&stream_cksum, outfd) != 0) goto out; } else { /* block not previously seen */ - if (cksum_and_write(drr, - sizeof (dmu_replay_record_t), &stream_cksum, - outfd) == -1) - goto out; - if (cksum_and_write(buf, - drrw->drr_length, - &stream_cksum, outfd) == -1) + if (dump_record(drr, buf, drrw->drr_length, + &stream_cksum, outfd) != 0) goto out; } break; } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &drr->drr_u.drr_write_embedded; + (void) ssread(buf, + P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp); + if (dump_record(drr, buf, + P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), + &stream_cksum, outfd) != 0) + goto out; + break; + } + case DRR_FREE: { - if (cksum_and_write(drr, sizeof (dmu_replay_record_t), - &stream_cksum, outfd) == -1) + if (dump_record(drr, NULL, 0, &stream_cksum, + outfd) != 0) goto out; break; } default: - (void) printf("INVALID record type 0x%x\n", + (void) fprintf(stderr, "INVALID record type 0x%x\n", drr->drr_type); /* should never happen, so assert */ assert(B_FALSE); @@ -526,13 +580,30 @@ fsavl_create(nvlist_t *fss) * Routines for dealing with the giant nvlist of fs-nvlists, etc. */ typedef struct send_data { + /* + * assigned inside every recursive call, + * restored from *_save on return: + * + * guid of fromsnap snapshot in parent dataset + * txg of fromsnap snapshot in current dataset + * txg of tosnap snapshot in current dataset + */ + uint64_t parent_fromsnap_guid; + uint64_t fromsnap_txg; + uint64_t tosnap_txg; + + /* the nvlists get accumulated during depth-first traversal */ nvlist_t *parent_snaps; nvlist_t *fss; nvlist_t *snapprops; + + /* send-receive configuration, does not change during traversal */ + const char *fsname; const char *fromsnap; const char *tosnap; boolean_t recursive; + boolean_t verbose; /* * The header nvlist is of the following format: @@ -565,11 +636,23 @@ send_iterate_snap(zfs_handle_t *zhp, voi { send_data_t *sd = arg; uint64_t guid = zhp->zfs_dmustats.dds_guid; + uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; char *snapname; nvlist_t *nv; snapname = strrchr(zhp->zfs_name, '@')+1; + if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { + if (sd->verbose) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "skipping snapshot %s because it was created " + "after the destination snapshot (%s)\n"), + zhp->zfs_name, sd->tosnap); + } + zfs_close(zhp); + return (0); + } + VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); /* * NB: if there is no fromsnap here (it's a newly created fs in @@ -663,6 +746,31 @@ send_iterate_prop(zfs_handle_t *zhp, nvl } /* + * returns snapshot creation txg + * and returns 0 if the snapshot does not exist + */ +static uint64_t +get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) +{ + char name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t txg = 0; + + if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') + return (txg); + + (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); + if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { + zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); + zfs_close(zhp); + } + } + + return (txg); +} + +/* * recursively generate nvlists describing datasets. See comment * for the data structure send_data_t above for description of contents * of the nvlist. 
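
The hunk that follows extends send_iterate_fs() to record, in send_data_t, the creation txg of the fromsnap and tosnap snapshots (via the new get_snap_txg() helper above) and to skip any child dataset whose own creation txg is newer than the destination snapshot, since such a dataset cannot appear in the stream. Below is a minimal standalone sketch of that txg comparison; it uses simplified stand-in types rather than the real libzfs send_data_t and zfs_handle_t, so it only illustrates the decision, not the patch's actual code.

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for the traversal state carried in send_data_t. */
    struct send_state {
            uint64_t tosnap_txg;    /* creation txg of the destination snapshot */
    };

    /*
     * Decide whether a dataset should be left out of a recursive send:
     * anything created after the destination snapshot cannot hold that
     * snapshot, so it is reported and skipped (mirrors the check the
     * following hunk adds to send_iterate_fs()).
     */
    static int
    skip_dataset(const struct send_state *sd, uint64_t dataset_txg,
        const char *dsname, const char *tosnap)
    {
            if (sd->tosnap_txg != 0 && dataset_txg > sd->tosnap_txg) {
                    (void) fprintf(stderr,
                        "skipping dataset %s: snapshot %s does not exist\n",
                        dsname, tosnap);
                    return (1);
            }
            return (0);
    }
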
@@ -674,9 +782,48 @@ send_iterate_fs(zfs_handle_t *zhp, void nvlist_t *nvfs, *nv; int rv = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; + uint64_t fromsnap_txg_save = sd->fromsnap_txg; + uint64_t tosnap_txg_save = sd->tosnap_txg; + uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; uint64_t guid = zhp->zfs_dmustats.dds_guid; + uint64_t fromsnap_txg, tosnap_txg; char guidstring[64]; + fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); + if (fromsnap_txg != 0) + sd->fromsnap_txg = fromsnap_txg; + + tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); + if (tosnap_txg != 0) + sd->tosnap_txg = tosnap_txg; + + /* + * on the send side, if the current dataset does not have tosnap, + * perform two additional checks: + * + * - skip sending the current dataset if it was created later than + * the parent tosnap + * - return error if the current dataset was created earlier than + * the parent tosnap + */ + if (sd->tosnap != NULL && tosnap_txg == 0) { + if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { + if (sd->verbose) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "skipping dataset %s: snapshot %s does " + "not exist\n"), zhp->zfs_name, sd->tosnap); + } + } else { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "cannot send %s@%s%s: snapshot %s@%s does not " + "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? + dgettext(TEXT_DOMAIN, " recursively") : "", + zhp->zfs_name, sd->tosnap); + rv = -1; + } + goto out; + } + VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name)); VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap", @@ -685,8 +832,10 @@ send_iterate_fs(zfs_handle_t *zhp, void if (zhp->zfs_dmustats.dds_origin[0]) { zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); - if (origin == NULL) - return (-1); + if (origin == NULL) { + rv = -1; + goto out; + } VERIFY(0 == nvlist_add_uint64(nvfs, "origin", origin->zfs_dmustats.dds_guid)); } @@ -701,7 +850,7 @@ send_iterate_fs(zfs_handle_t *zhp, void sd->parent_fromsnap_guid = 0; VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0)); VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0)); - (void) zfs_iter_snapshots(zhp, send_iterate_snap, sd); + (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd); VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps)); VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops)); nvlist_free(sd->parent_snaps); @@ -717,7 +866,10 @@ send_iterate_fs(zfs_handle_t *zhp, void if (sd->recursive) rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); +out: sd->parent_fromsnap_guid = parent_fromsnap_guid_save; + sd->fromsnap_txg = fromsnap_txg_save; + sd->tosnap_txg = tosnap_txg_save; zfs_close(zhp); return (rv); @@ -725,7 +877,8 @@ send_iterate_fs(zfs_handle_t *zhp, void static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, - const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp) + const char *tosnap, boolean_t recursive, boolean_t verbose, + nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; @@ -736,9 +889,11 @@ gather_nvlist(libzfs_handle_t *hdl, cons return (EZFS_BADTYPE); VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); + sd.fsname = fsname; sd.fromsnap = fromsnap; sd.tosnap = tosnap; sd.recursive = recursive; + sd.verbose = verbose; if ((error = send_iterate_fs(zhp, &sd)) != 0) { nvlist_free(sd.fss); @@ -759,128 +914,137 @@ 
gather_nvlist(libzfs_handle_t *hdl, cons } /* - * Routines for dealing with the sorted snapshot functionality - */ -typedef struct zfs_node { - zfs_handle_t *zn_handle; - avl_node_t zn_avlnode; -} zfs_node_t; - -static int -zfs_sort_snaps(zfs_handle_t *zhp, void *data) -{ - avl_tree_t *avl = data; - zfs_node_t *node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); - - node->zn_handle = zhp; - avl_add(avl, node); - return (0); -} - -/* ARGSUSED */ -static int -zfs_snapshot_compare(const void *larg, const void *rarg) -{ - zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; - zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; - uint64_t lcreate, rcreate; - - /* - * Sort them according to creation time. We use the hidden - * CREATETXG property to get an absolute ordering of snapshots. - */ - lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); - rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); - - if (lcreate < rcreate) - return (-1); - else if (lcreate > rcreate) - return (+1); - else - return (0); -} - -int -zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data) -{ - int ret = 0; - zfs_node_t *node; - avl_tree_t avl; - void *cookie = NULL; - - avl_create(&avl, zfs_snapshot_compare, - sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); - - ret = zfs_iter_snapshots(zhp, zfs_sort_snaps, &avl); - - for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) - ret |= callback(node->zn_handle, data); - - while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL) - free(node); - - avl_destroy(&avl); - - return (ret); -} - -/* * Routines specific to "zfs send" */ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; - char prevsnap[ZFS_MAXNAMELEN]; + char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t prevsnap_obj; boolean_t seenfrom, seento, replicate, doall, fromorigin; - boolean_t verbose; + boolean_t verbose, dryrun, parsable, progress, embed_data, std_out; + boolean_t large_block; int outfd; boolean_t err; nvlist_t *fss; + nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; + nvlist_t *debugnv; + char holdtag[ZFS_MAX_DATASET_NAME_LEN]; + int cleanup_fd; + uint64_t size; } send_dump_data_t; +static int +estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj, + boolean_t fromorigin, uint64_t *sizep) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + assert(fromsnap_obj == 0 || !fromorigin); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_obj = fromorigin; + zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zc.zc_fromobj = fromsnap_obj; + zc.zc_guid = 1; /* estimate flag */ + + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot estimate space for '%s'"), zhp->zfs_name); + + switch (errno) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + + case ENOENT: + if (zfs_dataset_exists(hdl, zc.zc_name, + ZFS_TYPE_SNAPSHOT)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source (@%s) does not exist"), + zc.zc_value); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + 
zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + *sizep = zc.zc_objset_type; + + return (0); +} + /* * Dumps a backup of the given snapshot (incremental from fromsnap if it's not * NULL) to the file descriptor specified by outfd. */ static int -dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, - int outfd, boolean_t enoent_ok, boolean_t *got_enoent) +dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, + boolean_t fromorigin, int outfd, enum lzc_send_flags flags, + nvlist_t *debugnv) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *thisdbg; assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin); + assert(fromsnap_obj == 0 || !fromorigin); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - if (fromsnap) - (void) strlcpy(zc.zc_value, fromsnap, sizeof (zc.zc_value)); zc.zc_cookie = outfd; zc.zc_obj = fromorigin; + zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zc.zc_fromobj = fromsnap_obj; + zc.zc_flags = flags; - *got_enoent = B_FALSE; + VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); + if (fromsnap && fromsnap[0] != '\0') { + VERIFY(0 == nvlist_add_string(thisdbg, + "fromsnap", fromsnap)); + } - if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) { + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "warning: cannot send '%s'"), zhp->zfs_name); - switch (errno) { + VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); + if (debugnv) { + VERIFY(0 == nvlist_add_nvlist(debugnv, + zhp->zfs_name, thisdbg)); + } + nvlist_free(thisdbg); + switch (errno) { case EXDEV: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "not an earlier snapshot from the same fs")); return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: - if (enoent_ok) { - *got_enoent = B_TRUE; - return (0); - } if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -894,7 +1058,9 @@ dump_ioctl(zfs_handle_t *zhp, const char case EIO: case ENOLINK: case ENOSPC: +#ifdef illumos case ENOSTR: +#endif case ENXIO: case EPIPE: case ERANGE: @@ -908,23 +1074,139 @@ dump_ioctl(zfs_handle_t *zhp, const char } } + if (debugnv) + VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); + nvlist_free(thisdbg); + return (0); } +static void +gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) +{ + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + + /* + * zfs_send() only sets snapholds for sends that need them, + * e.g. replication and doall. + */ + if (sdd->snapholds == NULL) + return; + + fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); +} + +static void * +send_progress_thread(void *arg) +{ + progress_arg_t *pa = arg; + zfs_cmd_t zc = { 0 }; + zfs_handle_t *zhp = pa->pa_zhp; + libzfs_handle_t *hdl = zhp->zfs_hdl; + unsigned long long bytes; + char buf[16]; + time_t t; + struct tm *tm; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (!pa->pa_parsable) + (void) fprintf(stderr, "TIME SENT SNAPSHOT\n"); + + /* + * Print the progress from ZFS_IOC_SEND_PROGRESS every second. 
+ */ + for (;;) { + (void) sleep(1); + + zc.zc_cookie = pa->pa_fd; + if (zfs_ioctl(hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) + return ((void *)-1); + + (void) time(&t); + tm = localtime(&t); + bytes = zc.zc_cookie; + + if (pa->pa_parsable) { + (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + bytes, zhp->zfs_name); + } else { + zfs_nicenum(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + buf, zhp->zfs_name); + } + } +} + +static void +send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, + uint64_t size, boolean_t parsable) +{ + if (parsable) { + if (fromsnap != NULL) { + (void) fprintf(fout, "incremental\t%s\t%s", + fromsnap, tosnap); + } else { + (void) fprintf(fout, "full\t%s", + tosnap); + } + } else { + if (fromsnap != NULL) { + if (strchr(fromsnap, '@') == NULL && + strchr(fromsnap, '#') == NULL) { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from @%s to %s"), + fromsnap, tosnap); + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from %s to %s"), + fromsnap, tosnap); + } + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "full send of %s"), + tosnap); + } + } + + if (size != 0) { + if (parsable) { + (void) fprintf(fout, "\t%llu", + (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + " estimated size is %s"), buf); + } + } + (void) fprintf(fout, "\n"); +} + static int dump_snapshot(zfs_handle_t *zhp, void *arg) { send_dump_data_t *sdd = arg; - const char *thissnap; + progress_arg_t pa = { 0 }; + pthread_t tid; + char *thissnap; int err; - boolean_t got_enoent; + boolean_t isfromsnap, istosnap, fromorigin; + boolean_t exclude = B_FALSE; + FILE *fout = sdd->std_out ? stdout : stderr; + err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; + isfromsnap = (sdd->fromsnap != NULL && + strcmp(sdd->fromsnap, thissnap) == 0); - if (sdd->fromsnap && !sdd->seenfrom && - strcmp(sdd->fromsnap, thissnap) == 0) { + if (!sdd->seenfrom && isfromsnap) { + gather_holds(zhp, sdd); sdd->seenfrom = B_TRUE; (void) strcpy(sdd->prevsnap, thissnap); + sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (0); } @@ -934,18 +1216,40 @@ dump_snapshot(zfs_handle_t *zhp, void *a return (0); } - if (strcmp(sdd->tosnap, thissnap) == 0) + istosnap = (strcmp(sdd->tosnap, thissnap) == 0); + if (istosnap) sdd->seento = B_TRUE; + if (!sdd->doall && !isfromsnap && !istosnap) { + if (sdd->replicate) { + char *snapname; + nvlist_t *snapprops; + /* + * Filter out all intermediate snapshots except origin + * snapshots needed to replicate clones. + */ + nvlist_t *nvfs = fsavl_find(sdd->fsavl, + zhp->zfs_dmustats.dds_guid, &snapname); + + VERIFY(0 == nvlist_lookup_nvlist(nvfs, + "snapprops", &snapprops)); + VERIFY(0 == nvlist_lookup_nvlist(snapprops, + thissnap, &snapprops)); + exclude = !nvlist_exists(snapprops, "is_clone_origin"); + } else { + exclude = B_TRUE; + } + } + /* * If a filter function exists, call it to determine whether * this snapshot will be sent. */ - if (sdd->filter_cb != NULL && - sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE) { + if (exclude || (sdd->filter_cb != NULL && + sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { /* * This snapshot is filtered out. 
Don't send it, and don't - * set prevsnap, so it will be as if this snapshot didn't + * set prevsnap_obj, so it will be as if this snapshot didn't * exist, and the next accepted snapshot will be sent as * an incremental from the last accepted one, or as the * first (and full) snapshot in the case of a replication, @@ -955,20 +1259,55 @@ dump_snapshot(zfs_handle_t *zhp, void *a return (0); } - /* send it */ + gather_holds(zhp, sdd); + fromorigin = sdd->prevsnap[0] == '\0' && + (sdd->fromorigin || sdd->replicate); + if (sdd->verbose) { - (void) fprintf(stderr, "sending from @%s to %s\n", - sdd->prevsnap, zhp->zfs_name); + uint64_t size = 0; + (void) estimate_ioctl(zhp, sdd->prevsnap_obj, + fromorigin, &size); + + send_print_verbose(fout, zhp->zfs_name, + sdd->prevsnap[0] ? sdd->prevsnap : NULL, + size, sdd->parsable); + sdd->size += size; } - err = dump_ioctl(zhp, sdd->prevsnap, - sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), - sdd->outfd, B_TRUE, &got_enoent); + if (!sdd->dryrun) { + /* + * If progress reporting is requested, spawn a new thread to + * poll ZFS_IOC_SEND_PROGRESS at a regular interval. + */ + if (sdd->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = sdd->outfd; + pa.pa_parsable = sdd->parsable; - if (got_enoent) - err = 0; - else - (void) strcpy(sdd->prevsnap, thissnap); + if ((err = pthread_create(&tid, NULL, + send_progress_thread, &pa)) != 0) { + zfs_close(zhp); + return (err); + } + } + + enum lzc_send_flags flags = 0; + if (sdd->large_block) + flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (sdd->embed_data) + flags |= LZC_SEND_FLAG_EMBED_DATA; + + err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, + fromorigin, sdd->outfd, flags, sdd->debugnv); + + if (sdd->progress) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + } + } + + (void) strcpy(sdd->prevsnap, thissnap); + sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); return (err); } @@ -984,8 +1323,8 @@ dump_filesystem(zfs_handle_t *zhp, void (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", zhp->zfs_name, sdd->tosnap); if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { - (void) fprintf(stderr, "WARNING: " - "could not send %s@%s: does not exist\n", + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: could not send %s@%s: does not exist\n"), zhp->zfs_name, sdd->tosnap); sdd->err = B_TRUE; return (0); @@ -1007,57 +1346,34 @@ dump_filesystem(zfs_handle_t *zhp, void } } - if (sdd->doall) { - sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; - if (sdd->fromsnap == NULL || missingfrom) - sdd->seenfrom = B_TRUE; - - rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg); - if (!sdd->seenfrom) { - (void) fprintf(stderr, + sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; + sdd->prevsnap_obj = 0; + if (sdd->fromsnap == NULL || missingfrom) + sdd->seenfrom = B_TRUE; + + rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg); + if (!sdd->seenfrom) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) does not exist\n"), + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + sdd->err = B_TRUE; + } else if (!sdd->seento) { + if (sdd->fromsnap) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) does not exist\n", + "incremental source (%s@%s) " + "is not earlier than it\n"), zhp->zfs_name, sdd->tosnap, zhp->zfs_name, sdd->fromsnap); - sdd->err = B_TRUE; - } else if (!sdd->seento) { - if (sdd->fromsnap) { 
- (void) fprintf(stderr, - "WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) " - "is not earlier than it\n", - zhp->zfs_name, sdd->tosnap, - zhp->zfs_name, sdd->fromsnap); - } else { - (void) fprintf(stderr, "WARNING: " - "could not send %s@%s: does not exist\n", - zhp->zfs_name, sdd->tosnap); - } - sdd->err = B_TRUE; - } - } else { - zfs_handle_t *snapzhp; - char snapname[ZFS_MAXNAMELEN]; - - (void) snprintf(snapname, sizeof (snapname), "%s@%s", - zfs_get_name(zhp), sdd->tosnap); - snapzhp = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT); - if (snapzhp == NULL) { - rv = -1; } else { - if (sdd->filter_cb == NULL || - sdd->filter_cb(snapzhp, sdd->filter_cb_arg) == - B_TRUE) { - boolean_t got_enoent; - - rv = dump_ioctl(snapzhp, - missingfrom ? NULL : sdd->fromsnap, - sdd->fromorigin || missingfrom, - sdd->outfd, B_FALSE, &got_enoent); - } - sdd->seento = B_TRUE; - zfs_close(snapzhp); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: " + "could not send %s@%s: does not exist\n"), + zhp->zfs_name, sdd->tosnap); } + sdd->err = B_TRUE; } return (rv); @@ -1073,16 +1389,39 @@ dump_filesystems(zfs_handle_t *rzhp, voi if (!sdd->replicate) return (dump_filesystem(rzhp, sdd)); + /* Mark the clone origin snapshots. */ + for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; + fspair = nvlist_next_nvpair(sdd->fss, fspair)) { + nvlist_t *nvfs; + uint64_t origin_guid = 0; + + VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs)); + (void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid); + if (origin_guid != 0) { + char *snapname; + nvlist_t *origin_nv = fsavl_find(sdd->fsavl, + origin_guid, &snapname); + if (origin_nv != NULL) { + nvlist_t *snapprops; + VERIFY(0 == nvlist_lookup_nvlist(origin_nv, + "snapprops", &snapprops)); + VERIFY(0 == nvlist_lookup_nvlist(snapprops, + snapname, &snapprops)); + VERIFY(0 == nvlist_add_boolean( + snapprops, "is_clone_origin")); + } + } + } again: needagain = progress = B_FALSE; for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; fspair = nvlist_next_nvpair(sdd->fss, fspair)) { - nvlist_t *fslist; + nvlist_t *fslist, *parent_nv; char *fsname; zfs_handle_t *zhp; int err; uint64_t origin_guid = 0; - nvlist_t *origin_nv; + uint64_t parent_guid = 0; VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); if (nvlist_lookup_boolean(fslist, "sent") == 0) @@ -1090,16 +1429,30 @@ again: VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0); (void) nvlist_lookup_uint64(fslist, "origin", &origin_guid); + (void) nvlist_lookup_uint64(fslist, "parentfromsnap", + &parent_guid); - origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL); - if (origin_nv && - nvlist_lookup_boolean(origin_nv, "sent") == ENOENT) { - /* - * origin has not been sent yet; - * skip this clone. - */ - needagain = B_TRUE; - continue; + if (parent_guid != 0) { + parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL); + if (!nvlist_exists(parent_nv, "sent")) { + /* parent has not been sent; skip this one */ + needagain = B_TRUE; + continue; + } + } + + if (origin_guid != 0) { + nvlist_t *origin_nv = fsavl_find(sdd->fsavl, + origin_guid, NULL); + if (origin_nv != NULL && + !nvlist_exists(origin_nv, "sent")) { + /* + * origin has not been sent yet; + * skip this clone. 
+ */ + needagain = B_TRUE; + continue; + } } zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET); @@ -1116,9 +1469,246 @@ again: assert(progress); goto again; } + + /* clean out the sent flags in case we reuse this fss */ + for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair; + fspair = nvlist_next_nvpair(sdd->fss, fspair)) { + nvlist_t *fslist; + + VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0); + (void) nvlist_remove_all(fslist, "sent"); + } + return (0); } +nvlist_t * +zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token) +{ + unsigned int version; + int nread; + unsigned long long checksum, packed_len; + + /* + * Decode token header, which is: + * -- + * Note that the only supported token version is 1. + */ + nread = sscanf(token, "%u-%llx-%llx-", + &version, &checksum, &packed_len); + if (nread != 3) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid format)")); + return (NULL); + } + + if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid version %u)"), + version); + return (NULL); + } + + /* convert hexadecimal representation to binary */ + token = strrchr(token, '-') + 1; + int len = strlen(token) / 2; + unsigned char *compressed = zfs_alloc(hdl, len); + for (int i = 0; i < len; i++) { + nread = sscanf(token + i * 2, "%2hhx", compressed + i); + if (nread != 1) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt " + "(payload is not hex-encoded)")); + return (NULL); + } + } + + /* verify checksum */ + zio_cksum_t cksum; + fletcher_4_native(compressed, len, NULL, &cksum); + if (cksum.zc_word[0] != checksum) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (incorrect checksum)")); + return (NULL); + } + + /* uncompress */ + void *packed = zfs_alloc(hdl, packed_len); + uLongf packed_len_long = packed_len; + if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || + packed_len_long != packed_len) { + free(packed); + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (decompression failed)")); + return (NULL); + } + + /* unpack nvlist */ + nvlist_t *nv; + int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); + free(packed); + free(compressed); + if (error != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (nvlist_unpack failed)")); + return (NULL); + } + return (nv); +} + +int +zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, + const char *resume_token) +{ + char errbuf[1024]; + char *toname; + char *fromname = NULL; + uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; + zfs_handle_t *zhp; + int error = 0; + char name[ZFS_MAX_DATASET_NAME_LEN]; + enum lzc_send_flags lzc_flags = 0; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + nvlist_t *resume_nvl = + zfs_send_resume_token_to_nvlist(hdl, resume_token); + if (resume_nvl == NULL) { + /* + * zfs_error_aux has already been set by + * zfs_send_resume_token_to_nvlist + */ + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + if (flags->verbose) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "resume token contents:\n")); + nvlist_print(stderr, resume_nvl); + } + + if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || + nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || + nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || + 
nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || + nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt")); + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + fromguid = 0; + (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); + + if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + + if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) { + if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is no longer the same snapshot used in " + "the initial send"), toname); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' used in the initial send no longer exists"), + toname); + } + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unable to access '%s'"), name); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + + if (fromguid != 0) { + if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source %#llx no longer exists"), + (longlong_t)fromguid); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + fromname = name; + } + + if (flags->verbose) { + uint64_t size = 0; + error = lzc_send_space(zhp->zfs_name, fromname, &size); + if (error == 0) + size = MAX(0, (int64_t)(size - bytes)); + send_print_verbose(stderr, zhp->zfs_name, fromname, + size, flags->parsable); + } + + if (!flags->dryrun) { + progress_arg_t pa = { 0 }; + pthread_t tid; + /* + * If progress reporting is requested, spawn a new thread to + * poll ZFS_IOC_SEND_PROGRESS at a regular interval. + */ + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = outfd; + pa.pa_parsable = flags->parsable; + + error = pthread_create(&tid, NULL, + send_progress_thread, &pa); + if (error != 0) { + zfs_close(zhp); + return (error); + } + } + + error = lzc_send_resume(zhp->zfs_name, fromname, outfd, + lzc_flags, resumeobj, resumeoff); + + if (flags->progress) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + } + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + + zfs_close(zhp); + + switch (error) { + case 0: + return (0); + case EXDEV: + case ENOENT: + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: +#ifdef illumos + case ENOSTR: +#endif + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + + zfs_close(zhp); + + return (error); +} + /* * Generate a send stream for the dataset identified by the argument zhp. 
* @@ -1137,22 +1727,21 @@ again: */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - sendflags_t flags, int outfd, snapfilter_cb_t filter_func, - void *cb_arg) + sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, + void *cb_arg, nvlist_t **debugnvp) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; - int err; + int err = 0; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; - char holdtag[128]; static uint64_t holdseq; int spa_version; - boolean_t holdsnaps = B_FALSE; - pthread_t tid; + pthread_t tid = 0; int pipefd[2]; dedup_arg_t dda = { 0 }; int featureflags = 0; + FILE *fout; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); @@ -1163,14 +1752,18 @@ zfs_send(zfs_handle_t *zhp, const char * return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } - if (zfs_spa_version(zhp, &spa_version) == 0 && - spa_version >= SPA_VERSION_USERREFS) - holdsnaps = B_TRUE; + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { + uint64_t version; + version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (version >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + } - if (flags.dedup) { + if (flags->dedup && !flags->dryrun) { featureflags |= (DMU_BACKUP_FEATURE_DEDUP | DMU_BACKUP_FEATURE_DEDUPPROPS); - if (err = pipe(pipefd)) { + if ((err = pipe(pipefd)) != 0) { zfs_error_aux(zhp->zfs_hdl, strerror(errno)); return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); @@ -1178,7 +1771,7 @@ zfs_send(zfs_handle_t *zhp, const char * dda.outputfd = outfd; dda.inputfd = pipefd[1]; dda.dedup_hdl = zhp->zfs_hdl; - if (err = pthread_create(&tid, NULL, cksummer, &dda)) { + if ((err = pthread_create(&tid, NULL, cksummer, &dda)) != 0) { (void) close(pipefd[0]); (void) close(pipefd[1]); zfs_error_aux(zhp->zfs_hdl, strerror(errno)); @@ -1187,23 +1780,13 @@ zfs_send(zfs_handle_t *zhp, const char * } } - if (flags.replicate || flags.doall || flags.props) { + if (flags->replicate || flags->doall || flags->props) { dmu_replay_record_t drr = { 0 }; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { 0 }; - if (holdsnaps) { - (void) snprintf(holdtag, sizeof (holdtag), - ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); - ++holdseq; - err = zfs_hold_range(zhp, fromsnap, tosnap, - holdtag, flags.replicate, B_TRUE); - if (err) - goto err_out; - } - - if (flags.replicate || flags.props) { + if (flags->replicate || flags->props) { nvlist_t *hdrnv; VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); @@ -1212,108 +1795,178 @@ zfs_send(zfs_handle_t *zhp, const char * "fromsnap", fromsnap)); } VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); - if (!flags.replicate) { + if (!flags->replicate) { VERIFY(0 == nvlist_add_boolean(hdrnv, "not_recursive")); } err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, - fromsnap, tosnap, flags.replicate, &fss, &fsavl); - if (err) { - if (holdsnaps) { - (void) zfs_release_range(zhp, fromsnap, - tosnap, holdtag, flags.replicate); - } + fromsnap, tosnap, flags->replicate, flags->verbose, + &fss, &fsavl); + if (err) goto err_out; - } VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); err = nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0); - nvlist_free(hdrnv); - if (err) { - fsavl_destroy(fsavl); - nvlist_free(fss); - if (holdsnaps) { - (void) zfs_release_range(zhp, fromsnap, - tosnap, holdtag, flags.replicate); - } + if (debugnvp) + *debugnvp = hdrnv; + else + nvlist_free(hdrnv); + if (err) goto stderr_out; - } } - /* write first begin record */ - drr.drr_type = DRR_BEGIN; - 
drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo, - DMU_COMPOUNDSTREAM); - DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo, - featureflags); - (void) snprintf(drr.drr_u.drr_begin.drr_toname, - sizeof (drr.drr_u.drr_begin.drr_toname), - "%s@%s", zhp->zfs_name, tosnap); - drr.drr_payloadlen = buflen; - err = cksum_and_write(&drr, sizeof (drr), &zc, outfd); - - /* write header nvlist */ - if (err != -1 && packbuf != NULL) { - err = cksum_and_write(packbuf, buflen, &zc, outfd); - } - free(packbuf); - if (err == -1) { - fsavl_destroy(fsavl); - nvlist_free(fss); - if (holdsnaps) { - (void) zfs_release_range(zhp, fromsnap, tosnap, - holdtag, flags.replicate); - } - err = errno; - goto stderr_out; - } + if (!flags->dryrun) { + /* write first begin record */ + drr.drr_type = DRR_BEGIN; + drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. + drr_versioninfo, DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. + drr_versioninfo, featureflags); + (void) snprintf(drr.drr_u.drr_begin.drr_toname, + sizeof (drr.drr_u.drr_begin.drr_toname), + "%s@%s", zhp->zfs_name, tosnap); + drr.drr_payloadlen = buflen; + + err = dump_record(&drr, packbuf, buflen, &zc, outfd); + free(packbuf); + if (err != 0) + goto stderr_out; - /* write end record */ - if (err != -1) { + /* write end record */ bzero(&drr, sizeof (drr)); drr.drr_type = DRR_END; drr.drr_u.drr_end.drr_checksum = zc; err = write(outfd, &drr, sizeof (drr)); if (err == -1) { - fsavl_destroy(fsavl); - nvlist_free(fss); err = errno; - if (holdsnaps) { - (void) zfs_release_range(zhp, fromsnap, - tosnap, holdtag, flags.replicate); - } goto stderr_out; } + + err = 0; } } /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; - if (flags.dedup) + if (tid != 0) sdd.outfd = pipefd[0]; else sdd.outfd = outfd; - sdd.replicate = flags.replicate; - sdd.doall = flags.doall; - sdd.fromorigin = flags.fromorigin; + sdd.replicate = flags->replicate; + sdd.doall = flags->doall; + sdd.fromorigin = flags->fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; - sdd.verbose = flags.verbose; + sdd.verbose = flags->verbose; + sdd.parsable = flags->parsable; + sdd.progress = flags->progress; + sdd.dryrun = flags->dryrun; + sdd.large_block = flags->largeblock; + sdd.embed_data = flags->embed_data; sdd.filter_cb = filter_func; sdd.filter_cb_arg = cb_arg; + if (debugnvp) + sdd.debugnv = *debugnvp; + if (sdd.verbose && sdd.dryrun) + sdd.std_out = B_TRUE; + fout = sdd.std_out ? stdout : stderr; + + /* + * Some flags require that we place user holds on the datasets that are + * being sent so they don't get destroyed during the send. We can skip + * this step if the pool is imported read-only since the datasets cannot + * be destroyed. 
+ */ + if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), + ZPOOL_PROP_READONLY, NULL) && + zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS && + (flags->doall || flags->replicate)) { + ++holdseq; + (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), + ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); + sdd.cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); + if (sdd.cleanup_fd < 0) { + err = errno; + goto stderr_out; + } + sdd.snapholds = fnvlist_alloc(); + } else { + sdd.cleanup_fd = -1; + sdd.snapholds = NULL; + } + if (flags->verbose || sdd.snapholds != NULL) { + /* + * Do a verbose no-op dry run to get all the verbose output + * or to gather snapshot hold's before generating any data, + * then do a non-verbose real run to generate the streams. + */ + sdd.dryrun = B_TRUE; + err = dump_filesystems(zhp, &sdd); + + if (err != 0) + goto stderr_out; + + if (flags->verbose) { + if (flags->parsable) { + (void) fprintf(fout, "size\t%llu\n", + (longlong_t)sdd.size); + } else { + char buf[16]; + zfs_nicenum(sdd.size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } + } + + /* Ensure no snaps found is treated as an error. */ + if (!sdd.seento) { + err = ENOENT; + goto err_out; + } + + /* Skip the second run if dryrun was requested. */ + if (flags->dryrun) + goto err_out; + + if (sdd.snapholds != NULL) { + err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); + if (err != 0) + goto stderr_out; + + fnvlist_free(sdd.snapholds); + sdd.snapholds = NULL; + } + + sdd.dryrun = B_FALSE; + sdd.verbose = B_FALSE; + } + err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); - if (flags.dedup) { + /* Ensure no snaps found is treated as an error. */ + if (err == 0 && !sdd.seento) + err = ENOENT; + + if (tid != 0) { + if (err != 0) + (void) pthread_cancel(tid); (void) close(pipefd[0]); (void) pthread_join(tid, NULL); } - if (flags.replicate || flags.doall || flags.props) { + if (sdd.cleanup_fd != -1) { + VERIFY(0 == close(sdd.cleanup_fd)); + sdd.cleanup_fd = -1; + } + + if (!flags->dryrun && (flags->replicate || flags->doall || + flags->props)) { /* * write final end record. 
NB: want to do this even if * there was some error, because it might not be totally @@ -1321,10 +1974,6 @@ zfs_send(zfs_handle_t *zhp, const char * */ dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; - if (holdsnaps) { - (void) zfs_release_range(zhp, fromsnap, tosnap, - holdtag, flags.replicate); - } if (write(outfd, &drr, sizeof (drr)) == -1) { return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); @@ -1336,14 +1985,77 @@ zfs_send(zfs_handle_t *zhp, const char * stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: - if (flags.dedup) { + fsavl_destroy(fsavl); + nvlist_free(fss); + fnvlist_free(sdd.snapholds); + + if (sdd.cleanup_fd != -1) + VERIFY(0 == close(sdd.cleanup_fd)); + if (tid != 0) { (void) pthread_cancel(tid); - (void) pthread_join(tid, NULL); (void) close(pipefd[0]); + (void) pthread_join(tid, NULL); } return (err); } +int +zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, + enum lzc_send_flags flags) +{ + int err; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + + err = lzc_send(zhp->zfs_name, from, fd, flags); + if (err != 0) { + switch (errno) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + + case ENOENT: + case ESRCH: + if (lzc_exists(zhp->zfs_name)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source (%s) does not exist"), + from); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "target is busy; if a filesystem, " + "it must not be mounted")); + return (zfs_error(hdl, EZFS_BUSY, errbuf)); + + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: +#ifdef illumos + case ENOSTR: +#endif + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + return (err != 0); +} + /* * Routines specific to "zfs recv" */ @@ -1356,6 +2068,8 @@ recv_read(libzfs_handle_t *hdl, int fd, int rv; int len = ilen; + assert(ilen <= SPA_MAXBLOCKSIZE); + do { rv = read(fd, cp, len); cp += rv; @@ -1407,7 +2121,7 @@ recv_read_nvlist(libzfs_handle_t *hdl, i static int recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, - int baselen, char *newname, recvflags_t flags) + int baselen, char *newname, recvflags_t *flags) { static int seq; zfs_cmd_t zc = { 0 }; @@ -1419,7 +2133,7 @@ recv_rename(libzfs_handle_t *hdl, const if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - flags.force ? MS_FORCE : 0); + flags->force ? 
MS_FORCE : 0); zfs_close(zhp); if (clp == NULL) return (-1); @@ -1435,7 +2149,7 @@ recv_rename(libzfs_handle_t *hdl, const (void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value)); - if (flags.verbose) { + if (flags->verbose) { (void) printf("attempting rename %s to %s\n", zc.zc_name, zc.zc_value); } @@ -1446,27 +2160,26 @@ recv_rename(libzfs_handle_t *hdl, const err = ENOENT; } - if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) { + if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { seq++; - (void) strncpy(newname, name, baselen); - (void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen, - "recv-%u-%u", getpid(), seq); + (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, + "%.*srecv-%u-%u", baselen, name, getpid(), seq); (void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value)); - if (flags.verbose) { + if (flags->verbose) { (void) printf("failed - trying rename %s to %s\n", zc.zc_name, zc.zc_value); } err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc); if (err == 0) changelist_rename(clp, name, newname); - if (err && flags.verbose) { + if (err && flags->verbose) { (void) printf("failed (%u) - " "will try again on next pass\n", errno); } err = EAGAIN; - } else if (flags.verbose) { + } else if (flags->verbose) { if (err == 0) (void) printf("success\n"); else @@ -1481,7 +2194,7 @@ recv_rename(libzfs_handle_t *hdl, const static int recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, - char *newname, recvflags_t flags) + char *newname, recvflags_t *flags) { zfs_cmd_t zc = { 0 }; int err = 0; @@ -1494,7 +2207,7 @@ recv_destroy(libzfs_handle_t *hdl, const if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, - flags.force ? MS_FORCE : 0); + flags->force ? MS_FORCE : 0); if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && zfs_spa_version(zhp, &spa_version) == 0 && spa_version >= SPA_VERSION_USERREFS) @@ -1510,11 +2223,11 @@ recv_destroy(libzfs_handle_t *hdl, const zc.zc_defer_destroy = defer; (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); - if (flags.verbose) + if (flags->verbose) (void) printf("attempting destroy %s\n", zc.zc_name); err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc); if (err == 0) { - if (flags.verbose) + if (flags->verbose) (void) printf("success\n"); changelist_remove(clp, zc.zc_name); } @@ -1536,66 +2249,97 @@ recv_destroy(libzfs_handle_t *hdl, const typedef struct guid_to_name_data { uint64_t guid; + boolean_t bookmark_ok; char *name; + char *skip; } guid_to_name_data_t; static int guid_to_name_cb(zfs_handle_t *zhp, void *arg) { guid_to_name_data_t *gtnd = arg; + const char *slash; int err; - if (zhp->zfs_dmustats.dds_guid == gtnd->guid) { + if (gtnd->skip != NULL && + (slash = strrchr(zhp->zfs_name, '/')) != NULL && + strcmp(slash + 1, gtnd->skip) == 0) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); zfs_close(zhp); return (EEXIST); } + err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); + if (err != EEXIST && gtnd->bookmark_ok) + err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } +/* + * Attempt to find the local dataset associated with this guid. In the case of + * multiple matches, we attempt to find the "best" match by searching + * progressively larger portions of the hierarchy. This allows one to send a + * tree of datasets individually and guarantee that we will find the source + * guid within that hierarchy, even if there are multiple matches elsewhere. 
+ */ static int guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid, - char *name) + boolean_t bookmark_ok, char *name) { - /* exhaustive search all local snapshots */ + char pname[ZFS_MAX_DATASET_NAME_LEN]; guid_to_name_data_t gtnd; - int err = 0; - zfs_handle_t *zhp; - char *cp; gtnd.guid = guid; + gtnd.bookmark_ok = bookmark_ok; gtnd.name = name; + gtnd.skip = NULL; - if (strchr(parent, '@') == NULL) { - zhp = make_dataset_handle(hdl, parent); - if (zhp != NULL) { - err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); - zfs_close(zhp); - if (err == EEXIST) - return (0); - } - } - - cp = strchr(parent, '/'); - if (cp) + /* + * Search progressively larger portions of the hierarchy, starting + * with the filesystem specified by 'parent'. This will + * select the "most local" version of the origin snapshot in the case + * that there are multiple matching snapshots in the system. + */ + (void) strlcpy(pname, parent, sizeof (pname)); + char *cp = strrchr(pname, '@'); + if (cp == NULL) + cp = strchr(pname, '\0'); + for (; cp != NULL; cp = strrchr(pname, '/')) { + /* Chop off the last component and open the parent */ *cp = '\0'; - zhp = make_dataset_handle(hdl, parent); - if (cp) - *cp = '/'; + zfs_handle_t *zhp = make_dataset_handle(hdl, pname); - if (zhp) { - err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); + if (zhp == NULL) + continue; + int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd); + if (err != EEXIST) + err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd); + if (err != EEXIST && bookmark_ok) + err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd); zfs_close(zhp); - } + if (err == EEXIST) + return (0); - return (err == EEXIST ? 0 : ENOENT); + /* + * Remember the last portion of the dataset so we skip it next + * time through (as we've already searched that portion of the + * hierarchy). + */ + gtnd.skip = strrchr(pname, '/') + 1; + } + return (ENOENT); } /* - * Return true if dataset guid1 is created before guid2. + * Return +1 if guid1 is before guid2, 0 if they are the same, and -1 if + * guid1 is after guid2.
*/ static int created_before(libzfs_handle_t *hdl, avl_tree_t *avl, @@ -1603,9 +2347,10 @@ created_before(libzfs_handle_t *hdl, avl { nvlist_t *nvfs; char *fsname, *snapname; - char buf[ZFS_MAXNAMELEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; int rv; - zfs_node_t zn1, zn2; + zfs_handle_t *guid1hdl, *guid2hdl; + uint64_t create1, create2; if (guid2 == 0) return (0); @@ -1615,54 +2360,65 @@ created_before(libzfs_handle_t *hdl, avl nvfs = fsavl_find(avl, guid1, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); - zn1.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); - if (zn1.zn_handle == NULL) + guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); + if (guid1hdl == NULL) return (-1); nvfs = fsavl_find(avl, guid2, &snapname); VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); (void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname); - zn2.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); - if (zn2.zn_handle == NULL) { - zfs_close(zn2.zn_handle); + guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT); + if (guid2hdl == NULL) { + zfs_close(guid1hdl); return (-1); } - rv = (zfs_snapshot_compare(&zn1, &zn2) == -1); + create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG); + create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG); + + if (create1 < create2) + rv = -1; + else if (create1 > create2) + rv = +1; + else + rv = 0; - zfs_close(zn1.zn_handle); - zfs_close(zn2.zn_handle); + zfs_close(guid1hdl); + zfs_close(guid2hdl); return (rv); } static int recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, - recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl) + recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, + nvlist_t *renamed) { - nvlist_t *local_nv; + nvlist_t *local_nv, *deleted = NULL; avl_tree_t *local_avl; nvpair_t *fselem, *nextfselem; - char *tosnap, *fromsnap; - char newname[ZFS_MAXNAMELEN]; + char *fromsnap; + char newname[ZFS_MAX_DATASET_NAME_LEN]; + char guidname[32]; int error; boolean_t needagain, progress, recursive; char *s1, *s2; VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); - VERIFY(0 == nvlist_lookup_string(stream_nv, "tosnap", &tosnap)); recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == ENOENT); - if (flags.dryrun) + if (flags->dryrun) return (0); again: needagain = progress = B_FALSE; + VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); + if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, - recursive, &local_nv, &local_avl)) != 0) + recursive, B_FALSE, &local_nv, &local_avl)) != 0) return (error); /* @@ -1715,7 +2471,7 @@ again: nvlist_t *origin_nvfs; char *origin_fsname; - if (flags.verbose) + if (flags->verbose) (void) printf("promoting %s\n", fsname); origin_nvfs = fsavl_find(local_avl, originguid, @@ -1761,9 +2517,9 @@ again: /* check for delete */ if (found == NULL) { - char name[ZFS_MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; - if (!flags.force) + if (!flags->force) continue; (void) snprintf(name, sizeof (name), "%s@%s", @@ -1775,6 +2531,8 @@ again: needagain = B_TRUE; else progress = B_TRUE; + sprintf(guidname, "%lu", thisguid); + nvlist_add_boolean(deleted, guidname); continue; } @@ -1799,8 +2557,8 @@ again: /* check for different snapname */ if (strcmp(nvpair_name(snapelem), stream_snapname) != 0) { - char name[ZFS_MAXNAMELEN]; - char tryname[ZFS_MAXNAMELEN]; + char name[ZFS_MAX_DATASET_NAME_LEN]; + char tryname[ZFS_MAX_DATASET_NAME_LEN]; (void) snprintf(name, sizeof (name), "%s@%s", 
fsname, nvpair_name(snapelem)); @@ -1821,7 +2579,7 @@ again: /* check for delete */ if (stream_nvfs == NULL) { - if (!flags.force) + if (!flags->force) continue; error = recv_destroy(hdl, fsname, strlen(tofs)+1, @@ -1830,13 +2588,18 @@ again: needagain = B_TRUE; else progress = B_TRUE; + sprintf(guidname, "%lu", parent_fromsnap_guid); + nvlist_add_boolean(deleted, guidname); continue; } - if (fromguid == 0 && flags.verbose) { - (void) printf("local fs %s does not have fromsnap " - "(%s in stream); must have been deleted locally; " - "ignoring\n", fsname, fromsnap); + if (fromguid == 0) { + if (flags->verbose) { + (void) printf("local fs %s does not have " + "fromsnap (%s in stream); must have " + "been deleted locally; ignoring\n", + fsname, fromsnap); + } continue; } @@ -1848,12 +2611,36 @@ again: s1 = strrchr(fsname, '/'); s2 = strrchr(stream_fsname, '/'); - /* check for rename */ + /* + * Check if we're going to rename based on parent guid change + * and the current parent guid was also deleted. If it was then + * rename will fail and is likely unneeded, so avoid this and + * force an early retry to determine the new + * parent_fromsnap_guid. + */ + if (stream_parent_fromsnap_guid != 0 && + parent_fromsnap_guid != 0 && + stream_parent_fromsnap_guid != parent_fromsnap_guid) { + sprintf(guidname, "%lu", parent_fromsnap_guid); + if (nvlist_exists(deleted, guidname)) { + progress = B_TRUE; + needagain = B_TRUE; + goto doagain; + } + } + + /* + * Check for rename. If the exact receive path is specified, it + * does not count as a rename, but we still need to check the + * datasets beneath it. + */ if ((stream_parent_fromsnap_guid != 0 && + parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || - ((s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { + ((flags->isprefix || strcmp(tofs, fsname) != 0) && + (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; - char tryname[ZFS_MAXNAMELEN]; + char tryname[ZFS_MAX_DATASET_NAME_LEN]; parent = fsavl_find(local_avl, stream_parent_fromsnap_guid, NULL); @@ -1873,14 +2660,22 @@ again: "%s%s", pname, strrchr(stream_fsname, '/')); } else { tryname[0] = '\0'; - if (flags.verbose) { + if (flags->verbose) { (void) printf("local fs %s new parent " "not found\n", fsname); } } + newname[0] = '\0'; + error = recv_rename(hdl, fsname, tryname, strlen(tofs)+1, newname, flags); + + if (renamed != NULL && newname[0] != '\0') { + VERIFY(0 == nvlist_add_boolean(renamed, + newname)); + } + if (error) needagain = B_TRUE; else @@ -1888,12 +2683,14 @@ again: } } +doagain: fsavl_destroy(local_avl); nvlist_free(local_nv); + nvlist_free(deleted); if (needagain && progress) { /* do another pass to fix up temporary names */ - if (flags.verbose) + if (flags->verbose) (void) printf("another pass:\n"); goto again; } @@ -1903,28 +2700,26 @@ again: static int zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, - recvflags_t flags, dmu_replay_record_t *drr, zio_cksum_t *zc, - char **top_zfs) + recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, + char **top_zfs, int cleanup_fd, uint64_t *action_handlep) { nvlist_t *stream_nv = NULL; avl_tree_t *stream_avl = NULL; char *fromsnap = NULL; - char tofs[ZFS_MAXNAMELEN]; + char *sendsnap = NULL; + char *cp; + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + char sendfs[ZFS_MAX_DATASET_NAME_LEN]; char errbuf[1024]; dmu_replay_record_t drre; int error; boolean_t anyerr = B_FALSE; boolean_t softerr = B_FALSE; + boolean_t recursive; (void) snprintf(errbuf, sizeof 
(errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); - if (strchr(destname, '@')) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "can not specify snapshot name for multi-snapshot stream")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == @@ -1935,20 +2730,30 @@ zfs_receive_package(libzfs_handle_t *hdl */ if (drr->drr_payloadlen != 0) { error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, - &stream_nv, flags.byteswap, zc); + &stream_nv, flags->byteswap, zc); if (error) { error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); goto out; } } + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + + if (recursive && strchr(destname, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot specify snapshot name for multi-snapshot stream")); + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + /* * Read in the end record and verify checksum. */ if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), - flags.byteswap, NULL))) + flags->byteswap, NULL))) goto out; - if (flags.byteswap) { + if (flags->byteswap) { drre.drr_type = BSWAP_32(drre.drr_type); drre.drr_u.drr_end.drr_checksum.zc_word[0] = BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); @@ -1985,21 +2790,81 @@ zfs_receive_package(libzfs_handle_t *hdl } if (fromsnap != NULL) { - (void) strlcpy(tofs, destname, ZFS_MAXNAMELEN); - if (flags.isprefix) { - int i = strcspn(drr->drr_u.drr_begin.drr_toname, - "/@"); + nvlist_t *renamed = NULL; + nvpair_t *pair = NULL; + + (void) strlcpy(tofs, destname, sizeof (tofs)); + if (flags->isprefix) { + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int i; + + if (flags->istail) { + cp = strrchr(drrb->drr_toname, '/'); + if (cp == NULL) { + (void) strlcat(tofs, "/", + sizeof (tofs)); + i = 0; + } else { + i = (cp - drrb->drr_toname); + } + } else { + i = strcspn(drrb->drr_toname, "/@"); + } /* zfs_receive_one() will create_parents() */ - (void) strlcat(tofs, - &drr->drr_u.drr_begin.drr_toname[i], - ZFS_MAXNAMELEN); + (void) strlcat(tofs, &drrb->drr_toname[i], + sizeof (tofs)); *strchr(tofs, '@') = '\0'; } - softerr = recv_incremental_replication(hdl, tofs, - flags, stream_nv, stream_avl); + + if (recursive && !flags->dryrun && !flags->nomount) { + VERIFY(0 == nvlist_alloc(&renamed, + NV_UNIQUE_NAME, 0)); + } + + softerr = recv_incremental_replication(hdl, tofs, flags, + stream_nv, stream_avl, renamed); + + /* Unmount renamed filesystems before receiving. */ + while ((pair = nvlist_next_nvpair(renamed, + pair)) != NULL) { + zfs_handle_t *zhp; + prop_changelist_t *clp = NULL; + + zhp = zfs_open(hdl, nvpair_name(pair), + ZFS_TYPE_FILESYSTEM); + if (zhp != NULL) { + clp = changelist_gather(zhp, + ZFS_PROP_MOUNTPOINT, 0, 0); + zfs_close(zhp); + if (clp != NULL) { + softerr |= + changelist_prefix(clp); + changelist_free(clp); + } + } + } + + nvlist_free(renamed); } } + /* + * Get the fs specified by the first path in the stream (the top level + * specified by 'zfs send') and pass it to each invocation of + * zfs_receive_one(). + */ + (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, + sizeof (sendfs)); + if ((cp = strchr(sendfs, '@')) != NULL) { + *cp = '\0'; + /* + * Find the "sendsnap", the final snapshot in a replication + * stream. zfs_receive_one() handles certain errors + * differently, depending on if the contained stream is the + * last one or not. 
+ */ + sendsnap = (cp + 1); + } /* Finally, receive each contained stream */ do { @@ -2010,8 +2875,9 @@ zfs_receive_package(libzfs_handle_t *hdl * zfs_receive_one() will take care of it (ie, * recv_skip() and return 0). */ - error = zfs_receive_impl(hdl, destname, flags, fd, - stream_avl, top_zfs); + error = zfs_receive_impl(hdl, destname, NULL, flags, fd, + sendfs, stream_nv, stream_avl, top_zfs, cleanup_fd, + action_handlep, sendsnap); if (error == ENODATA) { error = 0; break; @@ -2025,13 +2891,12 @@ zfs_receive_package(libzfs_handle_t *hdl * renames again. */ softerr = recv_incremental_replication(hdl, tofs, flags, - stream_nv, stream_avl); + stream_nv, stream_avl, NULL); } out: fsavl_destroy(stream_avl); - if (stream_nv) - nvlist_free(stream_nv); + nvlist_free(stream_nv); if (softerr) error = -2; if (anyerr) @@ -2056,7 +2921,7 @@ static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; - void *buf = malloc(1<<20); + void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -2072,11 +2937,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, switch (drr->drr_type) { case DRR_BEGIN: - /* NB: not to be used on v2 stream packages */ if (drr->drr_payloadlen != 0) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid substream header")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + (void) recv_read(hdl, fd, buf, + drr->drr_payloadlen, B_FALSE, NULL); } break; @@ -2103,7 +2966,24 @@ recv_skip(libzfs_handle_t *hdl, int fd, (void) recv_read(hdl, fd, buf, drr->drr_u.drr_write.drr_length, B_FALSE, NULL); break; - + case DRR_SPILL: + if (byteswap) { + drr->drr_u.drr_spill.drr_length = + BSWAP_64(drr->drr_u.drr_spill.drr_length); + } + (void) recv_read(hdl, fd, buf, + drr->drr_u.drr_spill.drr_length, B_FALSE, NULL); + break; + case DRR_WRITE_EMBEDDED: + if (byteswap) { + drr->drr_u.drr_write_embedded.drr_psize = + BSWAP_32(drr->drr_u.drr_write_embedded. + drr_psize); + } + (void) recv_read(hdl, fd, buf, + P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, + 8), B_FALSE, NULL); + break; case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: @@ -2120,37 +3000,76 @@ recv_skip(libzfs_handle_t *hdl, int fd, return (-1); } +static void +recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, + boolean_t resumable) +{ + char target_fs[ZFS_MAX_DATASET_NAME_LEN]; + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "checksum mismatch or incomplete stream")); + + if (!resumable) + return; + (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); + *strchr(target_fs, '@') = '\0'; + zfs_handle_t *zhp = zfs_open(hdl, target_fs, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return; + + char token_buf[ZFS_MAXPROPLEN]; + int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + token_buf, sizeof (token_buf), + NULL, NULL, 0, B_TRUE); + if (error == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "checksum mismatch or incomplete stream.\n" + "Partially received snapshot is saved.\n" + "A resuming stream can be generated on the sending " + "system by running:\n" + " zfs send -t %s"), + token_buf); + } + zfs_close(zhp); +} + /* * Restores a backup of tosnap from the file descriptor specified by infd. 
*/ static int zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, - recvflags_t flags, dmu_replay_record_t *drr, - dmu_replay_record_t *drr_noswap, avl_tree_t *stream_avl, - char **top_zfs) + const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr, + dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv, + avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, + uint64_t *action_handlep, const char *finalsnap) { zfs_cmd_t zc = { 0 }; time_t begin_time; - int ioctl_err, ioctl_errno, err, choplen; + int ioctl_err, ioctl_errno, err; char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; char prop_errbuf[1024]; - char chopprefix[ZFS_MAXNAMELEN]; + const char *chopprefix; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; zprop_errflags_t prop_errflags; + boolean_t recursive; + char *snapname = NULL; begin_time = time(NULL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + if (stream_avl != NULL) { - char *snapname; nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, &snapname); nvlist_t *props; @@ -2162,7 +3081,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in if (err) VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); - if (flags.canmountoff) { + if (flags->canmountoff) { VERIFY(0 == nvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); } @@ -2179,6 +3098,8 @@ zfs_receive_one(libzfs_handle_t *hdl, in return (-1); } + cp = NULL; + /* * Determine how much of the snapshot name stored in the stream * we are going to tack on to the name they specified on the @@ -2187,43 +3108,85 @@ zfs_receive_one(libzfs_handle_t *hdl, in * If they specified a snapshot, chop the entire name stored in * the stream. */ - (void) strcpy(chopprefix, drrb->drr_toname); - if (flags.isprefix) { + if (flags->istail) { + /* + * A filesystem was specified with -e. We want to tack on only + * the tail of the sent snapshot path. + */ + if (strchr(tosnap, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "argument - snapshot not allowed with -e")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + chopprefix = strrchr(sendfs, '/'); + + if (chopprefix == NULL) { + /* + * The tail is the poolname, so we need to + * prepend a path separator. + */ + int len = strlen(drrb->drr_toname); + cp = malloc(len + 2); + cp[0] = '/'; + (void) strcpy(&cp[1], drrb->drr_toname); + chopprefix = cp; + } else { + chopprefix = drrb->drr_toname + (chopprefix - sendfs); + } + } else if (flags->isprefix) { /* - * They specified a fs with -d or -e. We want to tack on + * A filesystem was specified with -d. We want to tack on * everything but the first element of the sent snapshot path - * (all but the pool name) in the case of -d, or only the tail - * of the sent snapshot path in the case of -e. + * (all but the pool name). */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "argument - snapshot not allowed with %s"), - (flags.istail ? "-e" : "-d")); + "argument - snapshot not allowed with -d")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } - cp = (flags.istail ? 
strrchr(chopprefix, '/') : - strchr(chopprefix, '/')); - if (cp == NULL) - cp = strchr(chopprefix, '@'); - *cp = '\0'; + + chopprefix = strchr(drrb->drr_toname, '/'); + if (chopprefix == NULL) + chopprefix = strchr(drrb->drr_toname, '@'); } else if (strchr(tosnap, '@') == NULL) { /* - * If they specified a filesystem without -d or -e, we want to - * tack on everything after the fs specified in the first name - * from the stream. + * If a filesystem was specified without -d or -e, we want to + * tack on everything after the fs specified by 'zfs send'. */ - cp = strchr(chopprefix, '@'); - *cp = '\0'; + chopprefix = drrb->drr_toname + strlen(sendfs); + } else { + /* A snapshot was specified as an exact path (no -d or -e). */ + if (recursive) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot specify snapshot name for multi-snapshot " + "stream")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); } - choplen = strlen(chopprefix); + + ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); + ASSERT(chopprefix > drrb->drr_toname); + ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname)); + ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || + chopprefix[0] == '\0'); /* * Determine name of destination snapshot, store in zc_value. */ - (void) strcpy(zc.zc_top_ds, tosnap); (void) strcpy(zc.zc_value, tosnap); - (void) strncat(zc.zc_value, drrb->drr_toname+choplen, - sizeof (zc.zc_value)); + (void) strncat(zc.zc_value, chopprefix, sizeof (zc.zc_value)); +#ifdef __FreeBSD__ + if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) + zfs_ioctl_version = get_zfs_ioctl_version(); + /* + * For forward compatibility hide tosnap in zc_value + */ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) + (void) strcpy(zc.zc_value + strlen(zc.zc_value) + 1, tosnap); +#endif + free(cp); if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); @@ -2233,20 +3196,27 @@ zfs_receive_one(libzfs_handle_t *hdl, in * Determine the name of the origin snapshot, store in zc_string. 
*/ if (drrb->drr_flags & DRR_FLAG_CLONE) { - if (guid_to_name(hdl, tosnap, - drrb->drr_fromguid, zc.zc_string) != 0) { + if (guid_to_name(hdl, zc.zc_value, + drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "local origin for clone %s does not exist"), zc.zc_value); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } - if (flags.verbose) + if (flags->verbose) (void) printf("found clone origin %s\n", zc.zc_string); + } else if (originsnap) { + (void) strncpy(zc.zc_string, originsnap, sizeof (zc.zc_string)); + if (flags->verbose) + (void) printf("using provided clone origin %s\n", + zc.zc_string); } + boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING; stream_wantsnewfs = (drrb->drr_fromguid == 0 || - (drrb->drr_flags & DRR_FLAG_CLONE)); + (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; if (stream_wantsnewfs) { /* @@ -2262,10 +3232,10 @@ zfs_receive_one(libzfs_handle_t *hdl, in *cp = '\0'; if (cp && !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { - char suffix[ZFS_MAXNAMELEN]; + char suffix[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(suffix, strrchr(zc.zc_value, '/')); - if (guid_to_name(hdl, tosnap, parent_snapguid, - zc.zc_value) == 0) { + if (guid_to_name(hdl, zc.zc_name, parent_snapguid, + B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, suffix); } @@ -2281,11 +3251,18 @@ zfs_receive_one(libzfs_handle_t *hdl, in (void) strcpy(zc.zc_name, zc.zc_value); *strchr(zc.zc_name, '@') = '\0'; - if (!zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { - char snap[ZFS_MAXNAMELEN]; + /* + * If the exact receive path was specified and this is the + * topmost path in the stream, then if the fs does not exist we + * should look no further. + */ + if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + + strlen(sendfs)) != '\0' && *chopprefix != '@')) && + !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { + char snap[ZFS_MAX_DATASET_NAME_LEN]; (void) strcpy(snap, strchr(zc.zc_value, '@')); - if (guid_to_name(hdl, tosnap, drrb->drr_fromguid, - zc.zc_value) == 0) { + if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid, + B_FALSE, zc.zc_value) == 0) { *strchr(zc.zc_value, '@') = '\0'; (void) strcat(zc.zc_value, snap); } @@ -2297,16 +3274,17 @@ zfs_receive_one(libzfs_handle_t *hdl, in if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) { zfs_handle_t *zhp; + /* - * Destination fs exists. Therefore this should either - * be an incremental, or the stream specifies a new fs - * (full stream or clone) and they want us to blow it - * away (and have therefore specified -F and removed any - * snapshots). + * Destination fs exists. It must be one of these cases: + * - an incremental send stream + * - the stream specifies a new fs (full stream or clone) + * and they want us to blow away the existing fs (and + * have therefore specified -F and removed any snapshots) + * - we are resuming a failed receive. 
*/ - if (stream_wantsnewfs) { - if (!flags.force) { + if (!flags->force) { zcmd_free_nvlists(&zc); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "destination '%s' exists\n" @@ -2342,7 +3320,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in return (zfs_error(hdl, EZFS_EXISTS, errbuf)); } - if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && + if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM && stream_wantsnewfs) { /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); @@ -2358,6 +3336,18 @@ zfs_receive_one(libzfs_handle_t *hdl, in return (-1); } } + + /* + * If we are resuming a newfs, set newfs here so that we will + * mount it if the recv succeeds this time. We can tell + * that it was a newfs on the first recv because the fs + * itself will be inconsistent (if the fs existed when we + * did the first recv, we would have received it into + * .../%recv). + */ + if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT)) + newfs = B_TRUE; + zfs_close(zhp); } else { /* @@ -2381,7 +3371,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in */ *cp = '\0'; - if (flags.isprefix && !flags.dryrun && + if (flags->isprefix && !flags->istail && !flags->dryrun && create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) { zcmd_free_nvlists(&zc); return (zfs_error(hdl, EZFS_BADRESTORE, errbuf)); @@ -2390,24 +3380,27 @@ zfs_receive_one(libzfs_handle_t *hdl, in newfs = B_TRUE; } - zc.zc_begin_record = drr_noswap->drr_u.drr_begin; + zc.zc_begin_record = *drr_noswap; zc.zc_cookie = infd; - zc.zc_guid = flags.force; - if (flags.verbose) { + zc.zc_guid = flags->force; + zc.zc_resumable = flags->resumable; + if (flags->verbose) { (void) printf("%s %s stream of %s into %s\n", - flags.dryrun ? "would receive" : "receiving", + flags->dryrun ? "would receive" : "receiving", drrb->drr_fromguid ? "incremental" : "full", drrb->drr_toname, zc.zc_value); (void) fflush(stdout); } - if (flags.dryrun) { + if (flags->dryrun) { zcmd_free_nvlists(&zc); - return (recv_skip(hdl, infd, flags.byteswap)); + return (recv_skip(hdl, infd, flags->byteswap)); } zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; zc.zc_nvlist_dst_size = sizeof (prop_errbuf); + zc.zc_cleanup_fd = cleanup_fd; + zc.zc_action_handle = *action_handlep; err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); ioctl_errno = errno; @@ -2432,7 +3425,21 @@ zfs_receive_one(libzfs_handle_t *hdl, in ZPROP_N_MORE_ERRORS) == 0) { trunc_prop_errs(intval); break; - } else { + } else if (snapname == NULL || finalsnap == NULL || + strcmp(finalsnap, snapname) == 0 || + strcmp(nvpair_name(prop_err), + zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { + /* + * Skip the special case of, for example, + * "refquota", errors on intermediate + * snapshots leading up to a final one. + * That's why we have all of the checks above. + * + * See zfs_ioctl.c's extract_delay_props() for + * a list of props which can fail on + * intermediate snapshots, but shouldn't + * affect the overall receive. 
+ */ (void) snprintf(tbuf, sizeof (tbuf), dgettext(TEXT_DOMAIN, "cannot receive %s property on %s"), @@ -2458,7 +3465,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in } } - if (err && (ioctl_errno == ENOENT || ioctl_errno == ENODEV)) { + if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { /* * It may be that this snapshot already exists, * in which case we want to consume & ignore it @@ -2466,7 +3473,7 @@ zfs_receive_one(libzfs_handle_t *hdl, in */ avl_tree_t *local_avl; nvlist_t *local_nv, *fs; - char *cp = strchr(zc.zc_value, '@'); + cp = strchr(zc.zc_value, '@'); /* * XXX Do this faster by just iterating over snaps in @@ -2475,19 +3482,19 @@ zfs_receive_one(libzfs_handle_t *hdl, in */ *cp = '\0'; if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, - &local_nv, &local_avl) == 0) { + B_FALSE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); fsavl_destroy(local_avl); nvlist_free(local_nv); if (fs != NULL) { - if (flags.verbose) { + if (flags->verbose) { (void) printf("snap %s already exists; " "ignoring\n", zc.zc_value); } err = ioctl_err = recv_skip(hdl, infd, - flags.byteswap); + flags->byteswap); } } *cp = '@'; @@ -2527,10 +3534,19 @@ zfs_receive_one(libzfs_handle_t *hdl, in (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid stream (checksum mismatch)")); + recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to receive this stream.")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EDQUOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination %s space quota exceeded"), zc.zc_name); + (void) zfs_error(hdl, EZFS_NOSPC, errbuf); + break; default: (void) zfs_standard_error(hdl, ioctl_errno, errbuf); } @@ -2565,7 +3581,8 @@ zfs_receive_one(libzfs_handle_t *hdl, in } if (clp) { - err |= changelist_postfix(clp); + if (!flags->nomount) + err |= changelist_postfix(clp); changelist_free(clp); } @@ -2585,7 +3602,9 @@ zfs_receive_one(libzfs_handle_t *hdl, in if (err || ioctl_err) return (-1); - if (flags.verbose) { + *action_handlep = zc.zc_action_handle; + + if (flags->verbose) { char buf1[64]; char buf2[64]; uint64_t bytes = zc.zc_cookie; @@ -2603,8 +3622,10 @@ zfs_receive_one(libzfs_handle_t *hdl, in } static int -zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, - int infd, avl_tree_t *stream_avl, char **top_zfs) +zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, + const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, + nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, int cleanup_fd, + uint64_t *action_handlep, const char *finalsnap) { int err; dmu_replay_record_t drr, drr_noswap; @@ -2617,12 +3638,18 @@ zfs_receive_impl(libzfs_handle_t *hdl, c (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); - if (flags.isprefix && + if (flags->isprefix && !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " "(%s) does not exist"), tosnap); return (zfs_error(hdl, EZFS_NOENT, errbuf)); } + if (originsnap && + !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " + "(%s) does not exist"), originsnap); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } /* read in the BEGIN record */ if (0 != (err = 
recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, @@ -2637,7 +3664,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, c /* the kernel needs the non-byteswapped begin record */ drr_noswap = drr; - flags.byteswap = B_FALSE; + flags->byteswap = B_FALSE; if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { /* * We computed the checksum in the wrong byteorder in @@ -2645,7 +3672,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, c */ bzero(&zcksum, sizeof (zio_cksum_t)); fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); - flags.byteswap = B_TRUE; + flags->byteswap = B_TRUE; drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); @@ -2682,13 +3709,29 @@ zfs_receive_impl(libzfs_handle_t *hdl, c } if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { - return (zfs_receive_one(hdl, infd, tosnap, flags, - &drr, &drr_noswap, stream_avl, top_zfs)); - } else { /* must be DMU_COMPOUNDSTREAM */ + char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; + if (sendfs == NULL) { + /* + * We were not called from zfs_receive_package(). Get + * the fs specified by 'zfs send'. + */ + char *cp; + (void) strlcpy(nonpackage_sendfs, + drr.drr_u.drr_begin.drr_toname, + sizeof (nonpackage_sendfs)); + if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) + *cp = '\0'; + sendfs = nonpackage_sendfs; + VERIFY(finalsnap == NULL); + } + return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, + &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, + cleanup_fd, action_handlep, finalsnap)); + } else { assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM); - return (zfs_receive_package(hdl, infd, tosnap, flags, - &drr, &zcksum, top_zfs)); + return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, + &zcksum, top_zfs, cleanup_fd, action_handlep)); } } @@ -2696,18 +3739,33 @@ zfs_receive_impl(libzfs_handle_t *hdl, c * Restores a backup of tosnap from the file descriptor specified by infd. * Return 0 on total success, -2 if some things couldn't be * destroyed/renamed/promoted, -1 if some things couldn't be received. - * (-1 will override -2). + * (-1 will override -2, if -1 and the resumable flag was specified the + * transfer can be resumed if the sending side supports it). 
*/ int -zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, - int infd, avl_tree_t *stream_avl) +zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props, + recvflags_t *flags, int infd, avl_tree_t *stream_avl) { char *top_zfs = NULL; int err; + int cleanup_fd; + uint64_t action_handle = 0; + char *originsnap = NULL; + if (props) { + err = nvlist_lookup_string(props, "origin", &originsnap); + if (err && err != ENOENT) + return (err); + } + + cleanup_fd = open(ZFS_DEV, O_RDWR|O_EXCL); + VERIFY(cleanup_fd >= 0); + + err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL, + stream_avl, &top_zfs, cleanup_fd, &action_handle, NULL); - err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs); + VERIFY(0 == close(cleanup_fd)); - if (err == 0 && !flags.nomount && top_zfs) { + if (err == 0 && !flags->nomount && top_zfs) { zfs_handle_t *zhp; prop_changelist_t *clp; Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 libzfs_status.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c 27 Feb 2010 22:30:28 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c 2 Sep 2013 11:36:30 -0000 @@ -18,9 +18,11 @@ * * CDDL HEADER END */ + /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* @@ -43,6 +45,7 @@ #include #include #include "libzfs_impl.h" +#include "zfeature_common.h" /* * Message ID table. 
This must be kept in sync with the ZPOOL_STATUS_* defines @@ -70,57 +73,66 @@ static char *zfs_msgid_table[] = { /* ARGSUSED */ static int -vdev_missing(uint64_t state, uint64_t aux, uint64_t errs) +vdev_missing(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN && - aux == VDEV_AUX_OPEN_FAILED); + return (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_OPEN_FAILED); } /* ARGSUSED */ static int -vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs) +vdev_faulted(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_FAULTED); + return (vs->vs_state == VDEV_STATE_FAULTED); } /* ARGSUSED */ static int -vdev_errors(uint64_t state, uint64_t aux, uint64_t errs) +vdev_errors(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_DEGRADED || errs != 0); + return (vs->vs_state == VDEV_STATE_DEGRADED || + vs->vs_read_errors != 0 || vs->vs_write_errors != 0 || + vs->vs_checksum_errors != 0); } /* ARGSUSED */ static int -vdev_broken(uint64_t state, uint64_t aux, uint64_t errs) +vdev_broken(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN); + return (vs->vs_state == VDEV_STATE_CANT_OPEN); } /* ARGSUSED */ static int -vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) +vdev_offlined(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_OFFLINE); + return (vs->vs_state == VDEV_STATE_OFFLINE); } /* ARGSUSED */ static int -vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +vdev_removed(vdev_stat_t *vs, uint_t vsc) +{ + return (vs->vs_state == VDEV_STATE_REMOVED); +} + +static int +vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_REMOVED); + return (VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift); } /* * Detect if any leaf devices that have seen errors or could not be opened. */ static boolean_t -find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) +find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), + boolean_t ignore_replacing) { nvlist_t **child; vdev_stat_t *vs; - uint_t c, children; - char *type; + uint_t c, vsc, children; /* * Ignore problems within a 'replacing' vdev, since we're presumably in @@ -128,26 +140,38 @@ find_vdev_problem(nvlist_t *vdev, int (* * out again. We'll pick up the fact that a resilver is happening * later. 
*/ - verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - return (B_FALSE); + if (ignore_replacing == B_TRUE) { + char *type; + + verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, + &type) == 0); + if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + return (B_FALSE); + } if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func)) + if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } else { - verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &c) == 0); + verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); - if (func(vs->vs_state, vs->vs_aux, - vs->vs_read_errors + - vs->vs_write_errors + - vs->vs_checksum_errors)) + if (func(vs, vsc) != 0) return (B_TRUE); } + /* + * Check any L2 cache devs + */ + if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, + &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev_problem(child[c], func, ignore_replacing)) + return (B_TRUE); + } + return (B_FALSE); } @@ -173,7 +197,8 @@ check_status(nvlist_t *config, boolean_t { nvlist_t *nvroot; vdev_stat_t *vs; - uint_t vsc; + pool_scan_stat_t *ps = NULL; + uint_t vsc, psc; uint64_t nerr; uint64_t version; uint64_t stateval; @@ -184,15 +209,24 @@ check_status(nvlist_t *config, boolean_t &version) == 0); verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_STATS, + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &stateval) == 0); - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + + /* + * Currently resilvering a vdev + */ + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &psc); + if (ps && ps->pss_func == POOL_SCAN_RESILVER && + ps->pss_state == DSS_SCANNING) + return (ZPOOL_STATUS_RESILVERING); /* * Pool last accessed by another system. */ + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); if (hostid != 0 && (unsigned long)hostid != gethostid() && stateval == POOL_STATE_ACTIVE) return (ZPOOL_STATUS_HOSTID_MISMATCH); @@ -205,6 +239,20 @@ check_status(nvlist_t *config, boolean_t return (ZPOOL_STATUS_VERSION_NEWER); /* + * Unsupported feature(s). + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_UNSUP_FEAT) { + nvlist_t *nvinfo; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + &nvinfo) == 0); + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) + return (ZPOOL_STATUS_UNSUP_FEAT_WRITE); + return (ZPOOL_STATUS_UNSUP_FEAT_READ); + } + + /* * Check that the config is complete. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && @@ -233,15 +281,15 @@ check_status(nvlist_t *config, boolean_t * Bad devices in non-replicated config. 
*/ if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_faulted)) + find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_missing)) + find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_broken)) + find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_NR); /* @@ -263,43 +311,67 @@ check_status(nvlist_t *config, boolean_t /* * Missing devices in a replicated config. */ - if (find_vdev_problem(nvroot, vdev_faulted)) + if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); - if (find_vdev_problem(nvroot, vdev_missing)) + if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_R); - if (find_vdev_problem(nvroot, vdev_broken)) + if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_R); /* * Devices with errors */ - if (!isimport && find_vdev_problem(nvroot, vdev_errors)) + if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) return (ZPOOL_STATUS_FAILING_DEV); /* * Offlined devices */ - if (find_vdev_problem(nvroot, vdev_offlined)) + if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) return (ZPOOL_STATUS_OFFLINE_DEV); /* * Removed device */ - if (find_vdev_problem(nvroot, vdev_removed)) + if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) return (ZPOOL_STATUS_REMOVED_DEV); /* - * Currently resilvering + * Suboptimal, but usable, ashift configuration. */ - if (!vs->vs_scrub_complete && vs->vs_scrub_type == POOL_SCRUB_RESILVER) - return (ZPOOL_STATUS_RESILVERING); + if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) + return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); /* * Outdated, but usable, version */ - if (version < SPA_VERSION) + if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) return (ZPOOL_STATUS_VERSION_OLDER); + /* + * Usable pool with disabled features + */ + if (version >= SPA_VERSION_FEATURES) { + int i; + nvlist_t *feat; + + if (isimport) { + feat = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_LOAD_INFO); + feat = fnvlist_lookup_nvlist(feat, + ZPOOL_CONFIG_ENABLED_FEAT); + } else { + feat = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_FEATURE_STATS); + } + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *fi = &spa_feature_table[i]; + if (!nvlist_exists(feat, fi->fi_guid)) + return (ZPOOL_STATUS_FEAT_DISABLED); + } + } + return (ZPOOL_STATUS_OK); } Index: src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c,v retrieving revision 1.6 diff -u -p -r1.6 libzfs_util.c --- src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c 26 Jan 2013 20:15:50 -0000 1.6 +++ src/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c 25 Apr 2017 17:20:07 -0000 @@ -18,15 +18,23 @@ * * CDDL HEADER END */ + /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov */ /* * Internal utility routines for the ZFS library. 
*/ +#include +#include +#include +#include + #include #include #include @@ -42,10 +50,17 @@ #include #include +#ifdef __NetBSD__ +#include +#endif + #include +#include #include "libzfs_impl.h" #include "zfs_prop.h" +#include "zfeature_common.h" + int libzfs_errno(libzfs_handle_t *hdl) @@ -71,7 +86,7 @@ libzfs_error_description(libzfs_handle_t case EZFS_BADPROP: return (dgettext(TEXT_DOMAIN, "invalid property value")); case EZFS_PROPREADONLY: - return (dgettext(TEXT_DOMAIN, "read only property")); + return (dgettext(TEXT_DOMAIN, "read-only property")); case EZFS_PROPTYPE: return (dgettext(TEXT_DOMAIN, "property doesn't apply to " "datasets of this type")); @@ -91,7 +106,7 @@ libzfs_error_description(libzfs_handle_t case EZFS_BADSTREAM: return (dgettext(TEXT_DOMAIN, "invalid backup stream")); case EZFS_DSREADONLY: - return (dgettext(TEXT_DOMAIN, "dataset is read only")); + return (dgettext(TEXT_DOMAIN, "dataset is read-only")); case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); @@ -113,7 +128,8 @@ libzfs_error_description(libzfs_handle_t case EZFS_RESILVERING: return (dgettext(TEXT_DOMAIN, "currently resilvering")); case EZFS_BADVERSION: - return (dgettext(TEXT_DOMAIN, "unsupported version")); + return (dgettext(TEXT_DOMAIN, "unsupported version or " + "feature")); case EZFS_POOLUNAVAIL: return (dgettext(TEXT_DOMAIN, "pool is unavailable")); case EZFS_DEVOVERFLOW: @@ -137,14 +153,12 @@ libzfs_error_description(libzfs_handle_t return (dgettext(TEXT_DOMAIN, "smb remove share failed")); case EZFS_SHARESMBFAILED: return (dgettext(TEXT_DOMAIN, "smb add share failed")); - case EZFS_ISCSISVCUNAVAIL: - return (dgettext(TEXT_DOMAIN, - "iscsitgt service need to be enabled by " - "a privileged user")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: return (dgettext(TEXT_DOMAIN, "out of space")); + case EZFS_FAULT: + return (dgettext(TEXT_DOMAIN, "bad address")); case EZFS_IO: return (dgettext(TEXT_DOMAIN, "I/O error")); case EZFS_INTR: @@ -158,12 +172,6 @@ libzfs_error_description(libzfs_handle_t return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); case EZFS_NOHISTORY: return (dgettext(TEXT_DOMAIN, "no history available")); - case EZFS_UNSHAREISCSIFAILED: - return (dgettext(TEXT_DOMAIN, - "iscsitgtd failed request to unshare")); - case EZFS_SHAREISCSIFAILED: - return (dgettext(TEXT_DOMAIN, - "iscsitgtd failed request to share")); case EZFS_POOLPROPS: return (dgettext(TEXT_DOMAIN, "failed to retrieve " "pool properties")); @@ -191,9 +199,6 @@ libzfs_error_description(libzfs_handle_t case EZFS_NODELEGATION: return (dgettext(TEXT_DOMAIN, "delegated administration is " "disabled on pool")); - case EZFS_PERMRDONLY: - return (dgettext(TEXT_DOMAIN, "snapshot permissions cannot be" - " modified")); case EZFS_BADCACHE: return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); case EZFS_ISL2CACHE: @@ -224,6 +229,17 @@ libzfs_error_description(libzfs_handle_t case EZFS_POSTSPLIT_ONLINE: return (dgettext(TEXT_DOMAIN, "disk was split from this pool " "into a new one")); + case EZFS_SCRUBBING: + return (dgettext(TEXT_DOMAIN, "currently scrubbing; " + "use 'zpool scrub -s' to cancel current scrub")); + case EZFS_NO_SCRUB: + return (dgettext(TEXT_DOMAIN, "there is no active scrub")); + case EZFS_DIFF: + return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); + case EZFS_DIFFDATA: + return (dgettext(TEXT_DOMAIN, "invalid diff data")); + case EZFS_POOLREADONLY: + return (dgettext(TEXT_DOMAIN, "pool is 
read-only")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -312,6 +328,10 @@ zfs_common_error(libzfs_handle_t *hdl, i zfs_verror(hdl, EZFS_IO, fmt, ap); return (-1); + case EFAULT: + zfs_verror(hdl, EZFS_FAULT, fmt, ap); + return (-1); + case EINTR: zfs_verror(hdl, EZFS_INTR, fmt, ap); return (-1); @@ -342,6 +362,7 @@ zfs_standard_error_fmt(libzfs_handle_t * switch (error) { case ENXIO: case ENODEV: + case EPIPE: zfs_verror(hdl, EZFS_IO, fmt, ap); break; @@ -354,6 +375,7 @@ zfs_standard_error_fmt(libzfs_handle_t * case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); + va_end(ap); return (-1); case EEXIST: @@ -368,9 +390,7 @@ zfs_standard_error_fmt(libzfs_handle_t * zfs_verror(hdl, EZFS_BUSY, fmt, ap); break; case EROFS: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "snapshot permissions cannot be modified")); - zfs_verror(hdl, EZFS_PERMRDONLY, fmt, ap); + zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); break; case ENAMETOOLONG: zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); @@ -455,13 +475,19 @@ zpool_standard_error_fmt(libzfs_handle_t case ENOSPC: case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); + va_end(ap); return (-1); + case EAGAIN: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool I/O is currently suspended")); zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); break; + case EROFS: + zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); + break; + default: zfs_error_aux(hdl, strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); @@ -495,6 +521,29 @@ zfs_alloc(libzfs_handle_t *hdl, size_t s } /* + * A safe form of asprintf() which will die if the allocation fails. + */ +/*PRINTFLIKE2*/ +char * +zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) +{ + va_list ap; + char *ret; + int err; + + va_start(ap, fmt); + + err = vasprintf(&ret, fmt, ap); + + va_end(ap); + + if (err < 0) + (void) no_memory(hdl); + + return (ret); +} + +/* * A safe form of realloc(), which also zeroes newly allocated space. */ void * @@ -575,12 +624,43 @@ libzfs_print_on_error(libzfs_handle_t *h hdl->libzfs_printerr = printerr; } +static int +libzfs_load(void) +{ +#ifdef __FreeBSD__ + if (modfind("zfs") < 0) { + /* Not present in kernel, try loading it. 
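For illustration only: a little further down, libzfs_load() grows a NetBSD branch that autoloads the zfs kernel module through modctl(2) before the library opens /dev/zfs. Pulled out as a stand-alone helper (a sketch, assuming <sys/module.h> declares modctl() and modctl_load_t as on current NetBSD), the same call sequence is:

#include <sys/module.h>
#include <errno.h>

/* Sketch: ask the kernel to load the "zfs" module; EEXIST means it is already loaded. */
static int
load_zfs_module(void)
{
	modctl_load_t cmdargs;

	cmdargs.ml_filename = "zfs";
	cmdargs.ml_flags = MODCTL_NO_PROP;
	cmdargs.ml_props = NULL;
	cmdargs.ml_propslen = 0;

	if (modctl(MODCTL_LOAD, &cmdargs) < 0 && errno != EEXIST)
		return (-1);
	return (0);
}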
*/ + if (kldload("zfs") < 0 || modfind("zfs") < 0) { + if (errno != EEXIST) + return (-1); + } + } +#endif +#ifdef __NetBSD__ + modctl_load_t cmdargs; + + cmdargs.ml_filename = "zfs"; + cmdargs.ml_flags = MODCTL_NO_PROP; + cmdargs.ml_props = NULL; + cmdargs.ml_propslen = 0; + + if (modctl(MODCTL_LOAD, &cmdargs) < 0 && errno != EEXIST) + return (-1); +#endif + return (0); +} + libzfs_handle_t * libzfs_init(void) { libzfs_handle_t *hdl; - if ((hdl = calloc(sizeof (libzfs_handle_t), 1)) == NULL) { + if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { + return (NULL); + } + + if (libzfs_load() < 0) { + free(hdl); return (NULL); } @@ -595,10 +675,19 @@ libzfs_init(void) return (NULL); } - hdl->libzfs_sharetab = fopen("/etc/dfs/sharetab", "r"); + hdl->libzfs_sharetab = fopen(ZFS_EXPORTS_PATH, "r"); + + if (libzfs_core_init() != 0) { + (void) close(hdl->libzfs_fd); + (void) fclose(hdl->libzfs_mnttab); + (void) fclose(hdl->libzfs_sharetab); + free(hdl); + return (NULL); + } zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); libzfs_mnttab_init(hdl); return (hdl); @@ -613,14 +702,13 @@ libzfs_fini(libzfs_handle_t *hdl) if (hdl->libzfs_sharetab) (void) fclose(hdl->libzfs_sharetab); zfs_uninit_libshare(hdl); - if (hdl->libzfs_log_str) - (void) free(hdl->libzfs_log_str); zpool_free_handles(hdl); -#ifdef PORT_SOLARIS +#ifdef illumos libzfs_fru_clear(hdl, B_TRUE); -#endif +#endif namespace_clear(hdl); libzfs_mnttab_fini(hdl); + libzfs_core_fini(); free(hdl); } @@ -651,7 +739,8 @@ zfs_get_pool_handle(const zfs_handle_t * zfs_handle_t * zfs_path_to_zhandle(libzfs_handle_t *hdl, char *path, zfs_type_t argtype) { - struct statvfs statbuf; + struct stat64 statbuf; + struct extmnttab entry; int ret; if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { @@ -661,18 +750,57 @@ zfs_path_to_zhandle(libzfs_handle_t *hdl return (zfs_open(hdl, path, argtype)); } - if (getstatfs(&statbuf, path) != 0) { + if (stat64(path, &statbuf) != 0) { (void) fprintf(stderr, "%s: %s\n", path, strerror(errno)); return (NULL); } - if (strcmp(statbuf.f_fstypename, MNTTYPE_ZFS) != 0) { +#ifdef illumos + rewind(hdl->libzfs_mnttab); + while ((ret = getextmntent(hdl->libzfs_mnttab, &entry, 0)) == 0) { + if (makedevice(entry.mnt_major, entry.mnt_minor) == + statbuf.st_dev) { + break; + } + } +#endif /* illumos */ +#ifdef __FreeBSD__ + { + struct statfs sfs; + + ret = statfs(path, &sfs); + if (ret == 0) + statfs2mnttab(&sfs, &entry); + else { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + } + } +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ + { + struct statvfs sfs; + + ret = statvfs(path, &sfs); + if (ret == 0) + statvfs2mnttab(&sfs, &entry); + else { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + } + } +#endif /* __NetBSD__ */ + if (ret != 0) { + return (NULL); + } + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), path); return (NULL); } - return (zfs_open(hdl, statbuf.f_mntfromname, ZFS_TYPE_FILESYSTEM)); + return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); } /* @@ -683,10 +811,11 @@ int zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) - len = 4*1024; + len = 16 * 1024; zc->zc_nvlist_dst_size = len; - if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) - zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == 0) + zc->zc_nvlist_dst = + (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); + if (zc->zc_nvlist_dst == 0) return (-1); return (0); @@ -701,8 +830,9 @@ int 
zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) { free((void *)(uintptr_t)zc->zc_nvlist_dst); - if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) - zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == 0) + zc->zc_nvlist_dst = + (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); + if (zc->zc_nvlist_dst == 0) return (-1); return (0); @@ -717,6 +847,9 @@ zcmd_free_nvlists(zfs_cmd_t *zc) free((void *)(uintptr_t)zc->zc_nvlist_conf); free((void *)(uintptr_t)zc->zc_nvlist_src); free((void *)(uintptr_t)zc->zc_nvlist_dst); + zc->zc_nvlist_conf = NULL; + zc->zc_nvlist_src = NULL; + zc->zc_nvlist_dst = NULL; } static int @@ -769,17 +902,7 @@ zcmd_read_dst_nvlist(libzfs_handle_t *hd int zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) { - int error; - - zc->zc_history = (uint64_t)(uintptr_t)hdl->libzfs_log_str; - error = ioctl(hdl->libzfs_fd, request, zc); - if (hdl->libzfs_log_str) { - free(hdl->libzfs_log_str); - hdl->libzfs_log_str = NULL; - } - zc->zc_history = 0; - - return (error); + return (ioctl(hdl->libzfs_fd, request, zc)); } /* @@ -918,7 +1041,7 @@ zprop_print_one_property(const char *nam const char *source, const char *recvd_value) { int i; - const char *str; + const char *str = NULL; char buf[128]; /* @@ -970,6 +1093,10 @@ zprop_print_one_property(const char *nam case ZPROP_SRC_RECEIVED: str = "received"; break; + + default: + str = NULL; + assert(!"unhandled zprop_source_t"); } break; @@ -1180,6 +1307,16 @@ zprop_parse_value(libzfs_handle_t *hdl, "use 'none' to disable quota/refquota")); goto error; } + + /* + * Special handling for "*_limit=none". In this case it's not + * 0 but UINT64_MAX. + */ + if ((type & ZFS_TYPE_DATASET) && isnone && + (prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT)) { + *ivalp = UINT64_MAX; + } break; case PROP_TYPE_INDEX: @@ -1241,8 +1378,11 @@ addlist(libzfs_handle_t *hdl, char *prop * this is a pool property or if this isn't a user-defined * dataset property, */ - if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || - (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) { + if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL && + !zpool_prop_feature(propname) && + !zpool_prop_unsupported(propname)) || + (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) && + !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, @@ -1254,7 +1394,8 @@ addlist(libzfs_handle_t *hdl, char *prop entry->pl_prop = prop; if (prop == ZPROP_INVAL) { - if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) { + if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == + NULL) { free(entry); return (-1); } Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.c 3 Dec 2016 17:05:32 -0000 @@ -0,0 +1,848 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* + * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. + * It has the following characteristics: + * + * - Thread Safe. libzfs_core is accessible concurrently from multiple + * threads. This is accomplished primarily by avoiding global data + * (e.g. caching). Since it's thread-safe, there is no reason for a + * process to have multiple libzfs "instances". Therefore, we store + * our few pieces of data (e.g. the file descriptor) in global + * variables. The fd is reference-counted so that the libzfs_core + * library can be "initialized" multiple times (e.g. by different + * consumers within the same process). + * + * - Committed Interface. The libzfs_core interface will be committed, + * therefore consumers can compile against it and be confident that + * their code will continue to work on future releases of this code. + * Currently, the interface is Evolving (not Committed), but we intend + * to commit to it once it is more complete and we determine that it + * meets the needs of all consumers. + * + * - Programatic Error Handling. libzfs_core communicates errors with + * defined error numbers, and doesn't print anything to stdout/stderr. + * + * - Thin Layer. libzfs_core is a thin layer, marshaling arguments + * to/from the kernel ioctls. There is generally a 1:1 correspondence + * between libzfs_core functions and ioctls to /dev/zfs. + * + * - Clear Atomicity. Because libzfs_core functions are generally 1:1 + * with kernel ioctls, and kernel ioctls are general atomic, each + * libzfs_core function is atomic. For example, creating multiple + * snapshots with a single call to lzc_snapshot() is atomic -- it + * can't fail with only some of the requested snapshots created, even + * in the event of power loss or system crash. + * + * - Continued libzfs Support. Some higher-level operations (e.g. + * support for "zfs send -R") are too complicated to fit the scope of + * libzfs_core. This functionality will continue to live in libzfs. + * Where appropriate, libzfs will use the underlying atomic operations + * of libzfs_core. For example, libzfs may implement "zfs send -R | + * zfs receive" by using individual "send one snapshot", rename, + * destroy, and "receive one snapshot" operations in libzfs_core. + * /sbin/zfs and /zbin/zpool will link with both libzfs and + * libzfs_core. Other consumers should aim to use only libzfs_core, + * since that will be the supported, stable interface going forwards. 
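For illustration only: the comment above describes libzfs_core as a thin, atomic, reference-counted wrapper around the /dev/zfs ioctls. A minimal consumer, sketched under the assumption that the library and headers install as <libzfs_core.h> and <libnvpair.h> and using a hypothetical pool "tank", would look like this; the two snapshots either both exist afterwards or neither does:

#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *snaps, *errlist = NULL;
	int err;

	if (libzfs_core_init() != 0)
		return (1);

	/* Keys are the snapshots to create, all in the same pool. */
	snaps = fnvlist_alloc();
	fnvlist_add_boolean(snaps, "tank/home@backup");
	fnvlist_add_boolean(snaps, "tank/src@backup");

	err = lzc_snapshot(snaps, NULL, &errlist);
	if (err != 0)
		(void) fprintf(stderr, "lzc_snapshot failed: %d\n", err);

	nvlist_free(snaps);
	nvlist_free(errlist);
	libzfs_core_fini();
	return (err != 0);
}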
+ */ + +#define _IN_LIBZFS_CORE_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libzfs_core_compat.h" +#include "libzfs_compat.h" + +#ifdef __FreeBSD__ +extern int zfs_ioctl_version; +#endif + +static int g_fd; +static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; +static int g_refcount; + +int +libzfs_core_init(void) +{ + (void) pthread_mutex_lock(&g_lock); + if (g_refcount == 0) { + g_fd = open("/dev/zfs", O_RDWR); + if (g_fd < 0) { + (void) pthread_mutex_unlock(&g_lock); + return (errno); + } + } + g_refcount++; + (void) pthread_mutex_unlock(&g_lock); + + return (0); +} + +void +libzfs_core_fini(void) +{ + (void) pthread_mutex_lock(&g_lock); + ASSERT3S(g_refcount, >, 0); + g_refcount--; + if (g_refcount == 0) + (void) close(g_fd); + (void) pthread_mutex_unlock(&g_lock); +} + +static int +lzc_ioctl(zfs_ioc_t ioc, const char *name, + nvlist_t *source, nvlist_t **resultp) +{ + zfs_cmd_t zc = { 0 }; + int error = 0; + char *packed; +#ifdef __FreeBSD__ + nvlist_t *oldsource; +#endif + size_t size; + + ASSERT3S(g_refcount, >, 0); + + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + +#ifdef __FreeBSD__ + if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) + zfs_ioctl_version = get_zfs_ioctl_version(); + + if (zfs_ioctl_version < ZFS_IOCVER_LZC) { + oldsource = source; + error = lzc_compat_pre(&zc, &ioc, &source); + if (error) + return (error); + } +#endif + + packed = fnvlist_pack(source, &size); + zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zc.zc_nvlist_src_size = size; + + if (resultp != NULL) { + *resultp = NULL; + zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); + zc.zc_nvlist_dst = (uint64_t)(uintptr_t) + malloc(zc.zc_nvlist_dst_size); +#ifdef illumos + if (zc.zc_nvlist_dst == NULL) { +#else + if (zc.zc_nvlist_dst == 0) { +#endif + error = ENOMEM; + goto out; + } + } + + while (ioctl(g_fd, ioc, &zc) != 0) { + if (errno == ENOMEM && resultp != NULL) { + free((void *)(uintptr_t)zc.zc_nvlist_dst); + zc.zc_nvlist_dst_size *= 2; + zc.zc_nvlist_dst = (uint64_t)(uintptr_t) + malloc(zc.zc_nvlist_dst_size); +#ifdef illumos + if (zc.zc_nvlist_dst == NULL) { +#else + if (zc.zc_nvlist_dst == 0) { +#endif + error = ENOMEM; + goto out; + } + } else { + error = errno; + break; + } + } + +#ifdef __FreeBSD__ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) + lzc_compat_post(&zc, ioc); +#endif + if (zc.zc_nvlist_dst_filled) { + *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, + zc.zc_nvlist_dst_size); + } +#ifdef __FreeBSD__ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) + lzc_compat_outnvl(&zc, ioc, resultp); +#endif +out: +#ifdef __FreeBSD__ + if (zfs_ioctl_version < ZFS_IOCVER_LZC) { + if (source != oldsource) + nvlist_free(source); + source = oldsource; + } +#endif + fnvlist_pack_free(packed, size); + free((void *)(uintptr_t)zc.zc_nvlist_dst); + return (error); +} + +int +lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_int32(args, "type", (dmu_objset_type_t)type); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); + nvlist_free(args); + return (error); +} + +int +lzc_clone(const char *fsname, const char *origin, + nvlist_t *props) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "origin", origin); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + error = lzc_ioctl(ZFS_IOC_CLONE, 
fsname, args, NULL); + nvlist_free(args); + return (error); +} + +/* + * Creates snapshots. + * + * The keys in the snaps nvlist are the snapshots to be created. + * They must all be in the same pool. + * + * The props nvlist is properties to set. Currently only user properties + * are supported. { user:prop_name -> string value } + * + * The returned results nvlist will have an entry for each snapshot that failed. + * The value will be the (int32) error code. + * + * The return value will be 0 if all snapshots were created, otherwise it will + * be the errno of a (unspecified) snapshot that failed. + */ +int +lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) +{ + nvpair_t *elem; + nvlist_t *args; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + *errlist = NULL; + + /* determine the pool name */ + elem = nvlist_next_nvpair(snaps, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "snaps", snaps); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + + error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); + nvlist_free(args); + + return (error); +} + +/* + * Destroys snapshots. + * + * The keys in the snaps nvlist are the snapshots to be destroyed. + * They must all be in the same pool. + * + * Snapshots that do not exist will be silently ignored. + * + * If 'defer' is not set, and a snapshot has user holds or clones, the + * destroy operation will fail and none of the snapshots will be + * destroyed. + * + * If 'defer' is set, and a snapshot has user holds or clones, it will be + * marked for deferred destruction, and will be destroyed when the last hold + * or clone is removed/destroyed. + * + * The return value will be 0 if all snapshots were destroyed (or marked for + * later destruction if 'defer' is set) or didn't exist to begin with. + * + * Otherwise the return value will be the errno of a (unspecified) snapshot + * that failed, no snapshots will be destroyed, and the errlist will have an + * entry for each snapshot that failed. The value in the errlist will be + * the (int32) error code. 
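For illustration only: the comments above specify that on failure the errlist carries one entry per snapshot, with the int32 error code as its value. A sketch of the reporting pattern a consumer would use with lzc_destroy_snaps() (the helper name is invented):

#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>
#include <string.h>

/* Sketch: destroy a set of snapshots and report each one that failed. */
static int
destroy_snaps_verbose(nvlist_t *snaps, boolean_t defer)
{
	nvlist_t *errlist = NULL;
	nvpair_t *pair;
	int err;

	err = lzc_destroy_snaps(snaps, defer, &errlist);
	if (errlist != NULL) {
		for (pair = nvlist_next_nvpair(errlist, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(errlist, pair)) {
			int32_t snap_err = 0;

			(void) nvpair_value_int32(pair, &snap_err);
			(void) fprintf(stderr, "cannot destroy %s: %s\n",
			    nvpair_name(pair), strerror(snap_err));
		}
	}
	nvlist_free(errlist);
	return (err);
}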
+ */ +int +lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) +{ + nvpair_t *elem; + nvlist_t *args; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + /* determine the pool name */ + elem = nvlist_next_nvpair(snaps, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "snaps", snaps); + if (defer) + fnvlist_add_boolean(args, "defer"); + + error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); + nvlist_free(args); + + return (error); +} + +int +lzc_snaprange_space(const char *firstsnap, const char *lastsnap, + uint64_t *usedp) +{ + nvlist_t *args; + nvlist_t *result; + int err; + char fs[ZFS_MAX_DATASET_NAME_LEN]; + char *atp; + + /* determine the fs name */ + (void) strlcpy(fs, firstsnap, sizeof (fs)); + atp = strchr(fs, '@'); + if (atp == NULL) + return (EINVAL); + *atp = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_string(args, "firstsnap", firstsnap); + + err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); + nvlist_free(args); + if (err == 0) + *usedp = fnvlist_lookup_uint64(result, "used"); + fnvlist_free(result); + + return (err); +} + +boolean_t +lzc_exists(const char *dataset) +{ + /* + * The objset_stats ioctl is still legacy, so we need to construct our + * own zfs_cmd_t rather than using zfsc_ioctl(). + */ + zfs_cmd_t zc = { 0 }; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); +} + +/* + * Create "user holds" on snapshots. If there is a hold on a snapshot, + * the snapshot can not be destroyed. (However, it can be marked for deletion + * by lzc_destroy_snaps(defer=B_TRUE).) + * + * The keys in the nvlist are snapshot names. + * The snapshots must all be in the same pool. + * The value is the name of the hold (string type). + * + * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL). + * In this case, when the cleanup_fd is closed (including on process + * termination), the holds will be released. If the system is shut down + * uncleanly, the holds will be released when the pool is next opened + * or imported. + * + * Holds for snapshots which don't exist will be skipped and have an entry + * added to errlist, but will not cause an overall failure. + * + * The return value will be 0 if all holds, for snapshots that existed, + * were succesfully created. + * + * Otherwise the return value will be the errno of a (unspecified) hold that + * failed and no holds will be created. + * + * In all cases the errlist will have an entry for each hold that failed + * (name = snapshot), with its value being the error code (int32). + */ +int +lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) +{ + char pool[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *args; + nvpair_t *elem; + int error; + + /* determine the pool name */ + elem = nvlist_next_nvpair(holds, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "holds", holds); + if (cleanup_fd != -1) + fnvlist_add_int32(args, "cleanup_fd", cleanup_fd); + + error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist); + nvlist_free(args); + return (error); +} + +/* + * Release "user holds" on snapshots. 
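For illustration only: lzc_hold(), documented above, ties the lifetime of a hold to a cleanup file descriptor. A sketch of a temporary hold, assuming the usual open("/dev/zfs", O_RDWR | O_EXCL) style for the cleanup fd (the helper name is invented):

#include <libzfs_core.h>
#include <libnvpair.h>
#include <fcntl.h>
#include <unistd.h>

/*
 * Sketch: hold a snapshot for the lifetime of the returned fd; closing it
 * (or exiting) releases the hold again.  Returns -1 on error.
 */
static int
hold_snapshot_temporarily(const char *snapname, const char *tag)
{
	nvlist_t *holds, *errlist = NULL;
	int cleanup_fd, err;

	cleanup_fd = open("/dev/zfs", O_RDWR | O_EXCL);
	if (cleanup_fd < 0)
		return (-1);

	holds = fnvlist_alloc();
	fnvlist_add_string(holds, snapname, tag);	/* snapshot -> hold name */

	err = lzc_hold(holds, cleanup_fd, &errlist);

	nvlist_free(holds);
	nvlist_free(errlist);
	if (err != 0) {
		(void) close(cleanup_fd);
		return (-1);
	}
	return (cleanup_fd);
}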
If the snapshot has been marked for + * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have + * any clones, and all the user holds are removed, then the snapshot will be + * destroyed. + * + * The keys in the nvlist are snapshot names. + * The snapshots must all be in the same pool. + * The value is a nvlist whose keys are the holds to remove. + * + * Holds which failed to release because they didn't exist will have an entry + * added to errlist, but will not cause an overall failure. + * + * The return value will be 0 if the nvl holds was empty or all holds that + * existed, were successfully removed. + * + * Otherwise the return value will be the errno of a (unspecified) hold that + * failed to release and no holds will be released. + * + * In all cases the errlist will have an entry for each hold that failed to + * to release. + */ +int +lzc_release(nvlist_t *holds, nvlist_t **errlist) +{ + char pool[ZFS_MAX_DATASET_NAME_LEN]; + nvpair_t *elem; + + /* determine the pool name */ + elem = nvlist_next_nvpair(holds, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist)); +} + +/* + * Retrieve list of user holds on the specified snapshot. + * + * On success, *holdsp will be set to a nvlist which the caller must free. + * The keys are the names of the holds, and the value is the creation time + * of the hold (uint64) in seconds since the epoch. + */ +int +lzc_get_holds(const char *snapname, nvlist_t **holdsp) +{ + int error; + nvlist_t *innvl = fnvlist_alloc(); + error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp); + fnvlist_free(innvl); + return (error); +} + +/* + * Generate a zfs send stream for the specified snapshot and write it to + * the specified file descriptor. + * + * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap") + * + * If "from" is NULL, a full (non-incremental) stream will be sent. + * If "from" is non-NULL, it must be the full name of a snapshot or + * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or + * "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or + * bookmark must represent an earlier point in the history of "snapname"). + * It can be an earlier snapshot in the same filesystem or zvol as "snapname", + * or it can be the origin of "snapname"'s filesystem, or an earlier + * snapshot in the origin, etc. + * + * "fd" is the file descriptor to write the send stream to. + * + * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted + * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT + * records with drr_blksz > 128K. + * + * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted + * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA, + * which the receiving system must support (as indicated by support + * for the "embedded_data" feature). 
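For illustration only: a sketch of the simplest consumer of lzc_send(), documented above, paired with lzc_send_space() (whose implementation follows below) to report the estimated stream size first. The output file name and helper are invented; a full, non-incremental stream is requested by passing from == NULL and no flags:

#include <libzfs_core.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <inttypes.h>

/* Sketch: write a full send stream for snapname into outfile. */
static int
send_full_stream(const char *snapname, const char *outfile)
{
	uint64_t space;
	int fd, err;

	if (lzc_send_space(snapname, NULL, &space) == 0)
		(void) printf("estimated stream size: %" PRIu64 " bytes\n",
		    space);

	fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return (errno);

	err = lzc_send(snapname, NULL, fd, 0);
	(void) close(fd);
	return (err);
}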
+ */ +int +lzc_send(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags) +{ + return (lzc_send_resume(snapname, from, fd, flags, 0, 0)); +} + +int +lzc_send_resume(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) +{ + nvlist_t *args; + int err; + + args = fnvlist_alloc(); + fnvlist_add_int32(args, "fd", fd); + if (from != NULL) + fnvlist_add_string(args, "fromsnap", from); + if (flags & LZC_SEND_FLAG_LARGE_BLOCK) + fnvlist_add_boolean(args, "largeblockok"); + if (flags & LZC_SEND_FLAG_EMBED_DATA) + fnvlist_add_boolean(args, "embedok"); + if (resumeobj != 0 || resumeoff != 0) { + fnvlist_add_uint64(args, "resume_object", resumeobj); + fnvlist_add_uint64(args, "resume_offset", resumeoff); + } + err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); + nvlist_free(args); + return (err); +} + +/* + * "from" can be NULL, a snapshot, or a bookmark. + * + * If from is NULL, a full (non-incremental) stream will be estimated. This + * is calculated very efficiently. + * + * If from is a snapshot, lzc_send_space uses the deadlists attached to + * each snapshot to efficiently estimate the stream size. + * + * If from is a bookmark, the indirect blocks in the destination snapshot + * are traversed, looking for blocks with a birth time since the creation TXG of + * the snapshot this bookmark was created from. This will result in + * significantly more I/O and be less efficient than a send space estimation on + * an equivalent snapshot. + */ +int +lzc_send_space(const char *snapname, const char *from, uint64_t *spacep) +{ + nvlist_t *args; + nvlist_t *result; + int err; + + args = fnvlist_alloc(); + if (from != NULL) + fnvlist_add_string(args, "from", from); + err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); + nvlist_free(args); + if (err == 0) + *spacep = fnvlist_lookup_uint64(result, "space"); + nvlist_free(result); + return (err); +} + +static int +recv_read(int fd, void *buf, int ilen) +{ + char *cp = buf; + int rv; + int len = ilen; + + do { + rv = read(fd, cp, len); + cp += rv; + len -= rv; + } while (rv > 0); + + if (rv < 0 || len != 0) + return (EIO); + + return (0); +} + +static int +recv_impl(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, boolean_t resumable, int fd, + const dmu_replay_record_t *begin_record) +{ + /* + * The receive ioctl is still legacy, so we need to construct our own + * zfs_cmd_t rather than using zfsc_ioctl(). + */ + zfs_cmd_t zc = { 0 }; + char *atp; + char *packed = NULL; + size_t size; + int error; + + ASSERT3S(g_refcount, >, 0); + + /* zc_name is name of containing filesystem */ + (void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name)); + atp = strchr(zc.zc_name, '@'); + if (atp == NULL) + return (EINVAL); + *atp = '\0'; + + /* if the fs does not exist, try its parent. 
*/ + if (!lzc_exists(zc.zc_name)) { + char *slashp = strrchr(zc.zc_name, '/'); + if (slashp == NULL) + return (ENOENT); + *slashp = '\0'; + + } + + /* zc_value is full name of the snapshot to create */ + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + + if (props != NULL) { + /* zc_nvlist_src is props to set */ + packed = fnvlist_pack(props, &size); + zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zc.zc_nvlist_src_size = size; + } + + /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */ + if (origin != NULL) + (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string)); + + /* zc_begin_record is non-byteswapped BEGIN record */ + if (begin_record == NULL) { + error = recv_read(fd, &zc.zc_begin_record, + sizeof (zc.zc_begin_record)); + if (error != 0) + goto out; + } else { + zc.zc_begin_record = *begin_record; + } + + /* zc_cookie is fd to read from */ + zc.zc_cookie = fd; + + /* zc guid is force flag */ + zc.zc_guid = force; + + zc.zc_resumable = resumable; + + /* zc_cleanup_fd is unused */ + zc.zc_cleanup_fd = -1; + + error = ioctl(g_fd, ZFS_IOC_RECV, &zc); + if (error != 0) + error = errno; + +out: + if (packed != NULL) + fnvlist_pack_free(packed, size); + free((void*)(uintptr_t)zc.zc_nvlist_dst); + return (error); +} + +/* + * The simplest receive case: receive from the specified fd, creating the + * specified snapshot. Apply the specified properties as "received" properties + * (which can be overridden by locally-set properties). If the stream is a + * clone, its origin snapshot must be specified by 'origin'. The 'force' + * flag will cause the target filesystem to be rolled back or destroyed if + * necessary to receive. + * + * Return 0 on success or an errno on failure. + * + * Note: this interface does not work on dedup'd streams + * (those with DMU_BACKUP_FEATURE_DEDUP). + */ +int +lzc_receive(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd) +{ + return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL)); +} + +/* + * Like lzc_receive, but if the receive fails due to premature stream + * termination, the intermediate state will be preserved on disk. In this + * case, ECKSUM will be returned. The receive may subsequently be resumed + * with a resuming send stream generated by lzc_send_resume(). + */ +int +lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, int fd) +{ + return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL)); +} + +/* + * Like lzc_receive, but allows the caller to read the begin record and then to + * pass it in. That could be useful if the caller wants to derive, for example, + * the snapname or the origin parameters based on the information contained in + * the begin record. + * The begin record must be in its original form as read from the stream, + * in other words, it should not be byteswapped. + * + * The 'resumable' parameter allows to obtain the same behavior as with + * lzc_receive_resumable. + */ +int +lzc_receive_with_header(const char *snapname, nvlist_t *props, + const char *origin, boolean_t force, boolean_t resumable, int fd, + const dmu_replay_record_t *begin_record) +{ + if (begin_record == NULL) + return (EINVAL); + return (recv_impl(snapname, props, origin, force, resumable, fd, + begin_record)); +} + +/* + * Roll back this filesystem or volume to its most recent snapshot. + * If snapnamebuf is not NULL, it will be filled in with the name + * of the most recent snapshot. 
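For illustration only: the receive side mirrors the send sketch above. The simplest path, per the lzc_receive() comment above (no received properties, not a clone, force a rollback of the target if needed), assuming boolean_t/B_TRUE are visible through the usual headers:

#include <libzfs_core.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

/* Sketch: create snapname from a stream stored in infile. */
static int
receive_full_stream(const char *snapname, const char *infile)
{
	int fd, err;

	fd = open(infile, O_RDONLY);
	if (fd < 0)
		return (errno);

	err = lzc_receive(snapname, NULL, NULL, B_TRUE, fd);
	(void) close(fd);
	return (err);
}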
+ * + * Return 0 on success or an errno on failure. + */ +int +lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen) +{ + nvlist_t *args; + nvlist_t *result; + int err; + + args = fnvlist_alloc(); + err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); + nvlist_free(args); + if (err == 0 && snapnamebuf != NULL) { + const char *snapname = fnvlist_lookup_string(result, "target"); + (void) strlcpy(snapnamebuf, snapname, snapnamelen); + } + return (err); +} + +/* + * Creates bookmarks. + * + * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to + * the name of the snapshot (e.g. "pool/fs@snap"). All the bookmarks and + * snapshots must be in the same pool. + * + * The returned results nvlist will have an entry for each bookmark that failed. + * The value will be the (int32) error code. + * + * The return value will be 0 if all bookmarks were created, otherwise it will + * be the errno of a (undetermined) bookmarks that failed. + */ +int +lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) +{ + nvpair_t *elem; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + /* determine the pool name */ + elem = nvlist_next_nvpair(bookmarks, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/#")] = '\0'; + + error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist); + + return (error); +} + +/* + * Retrieve bookmarks. + * + * Retrieve the list of bookmarks for the given file system. The props + * parameter is an nvlist of property names (with no values) that will be + * returned for each bookmark. + * + * The following are valid properties on bookmarks, all of which are numbers + * (represented as uint64 in the nvlist) + * + * "guid" - globally unique identifier of the snapshot it refers to + * "createtxg" - txg when the snapshot it refers to was created + * "creation" - timestamp when the snapshot it refers to was created + * + * The format of the returned nvlist as follows: + * -> { + * -> { + * "value" -> uint64 + * } + * } + */ +int +lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks) +{ + return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks)); +} + +/* + * Destroys bookmarks. + * + * The keys in the bmarks nvlist are the bookmarks to be destroyed. + * They must all be in the same pool. Bookmarks are specified as + * #. + * + * Bookmarks that do not exist will be silently ignored. + * + * The return value will be 0 if all bookmarks that existed were destroyed. + * + * Otherwise the return value will be the errno of a (undetermined) bookmark + * that failed, no bookmarks will be destroyed, and the errlist will have an + * entry for each bookmarks that failed. The value in the errlist will be + * the (int32) error code. 
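For illustration only: bookmarks are named pool/fs#name and, as the comments above describe, are created from snapshots and destroyed by key. A sketch pairing lzc_bookmark() with lzc_destroy_bookmarks() (dataset names are hypothetical):

#include <libzfs_core.h>
#include <libnvpair.h>

/* Sketch: bookmark a snapshot, then remove the bookmark again. */
static int
bookmark_roundtrip(void)
{
	nvlist_t *bmarks, *todel, *errlist = NULL;
	int err;

	bmarks = fnvlist_alloc();
	/* Key is the bookmark; value is the snapshot it refers to. */
	fnvlist_add_string(bmarks, "tank/fs#before-upgrade",
	    "tank/fs@before-upgrade");

	err = lzc_bookmark(bmarks, &errlist);
	nvlist_free(bmarks);
	nvlist_free(errlist);
	errlist = NULL;

	if (err == 0) {
		todel = fnvlist_alloc();
		fnvlist_add_boolean(todel, "tank/fs#before-upgrade");
		err = lzc_destroy_bookmarks(todel, &errlist);
		nvlist_free(todel);
		nvlist_free(errlist);
	}
	return (err);
}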
+ */ +int +lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist) +{ + nvpair_t *elem; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + /* determine the pool name */ + elem = nvlist_next_nvpair(bmarks, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/#")] = '\0'; + + error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist); + + return (error); +} Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core.h 3 Dec 2016 17:05:32 -0000 @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013 by Martin Matuska . All rights reserved. + */ + +#ifndef _LIBZFS_CORE_H +#define _LIBZFS_CORE_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int libzfs_core_init(void); +void libzfs_core_fini(void); + +/* + * NB: this type should be kept binary compatible with dmu_objset_type_t. 
+ */ +enum lzc_dataset_type { + LZC_DATSET_TYPE_ZFS = 2, + LZC_DATSET_TYPE_ZVOL +}; + +int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); +int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *); +int lzc_clone(const char *, const char *, nvlist_t *); +int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); +int lzc_bookmark(nvlist_t *, nvlist_t **); +int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); +int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); + +int lzc_snaprange_space(const char *, const char *, uint64_t *); + +int lzc_hold(nvlist_t *, int, nvlist_t **); +int lzc_release(nvlist_t *, nvlist_t **); +int lzc_get_holds(const char *, nvlist_t **); + +enum lzc_send_flags { + LZC_SEND_FLAG_EMBED_DATA = 1 << 0, + LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1 +}; + +int lzc_send(const char *, const char *, int, enum lzc_send_flags); +int lzc_send_resume(const char *, const char *, int, + enum lzc_send_flags, uint64_t, uint64_t); +int lzc_send_space(const char *, const char *, uint64_t *); + +struct dmu_replay_record; + +int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int); +int lzc_receive_resumable(const char *, nvlist_t *, const char *, + boolean_t, int); +int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t, + boolean_t, int, const struct dmu_replay_record *); + +boolean_t lzc_exists(const char *); + +int lzc_rollback(const char *, char *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_CORE_H */ Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.c 23 Mar 2013 15:31:39 -0000 @@ -0,0 +1,189 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. 
+ */ + +#include +#include +#include "libzfs_core_compat.h" + +extern int zfs_ioctl_version; + +int +lzc_compat_pre(zfs_cmd_t *zc, zfs_ioc_t *ioc, nvlist_t **source) +{ + nvlist_t *nvl = NULL; + nvpair_t *pair, *hpair; + char *buf, *val; + zfs_ioc_t vecnum; + uint32_t type32; + int32_t cleanup_fd; + int error = 0; + int pos; + + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + return (0); + + vecnum = *ioc; + + switch (vecnum) { + case ZFS_IOC_CREATE: + type32 = fnvlist_lookup_int32(*source, "type"); + zc->zc_objset_type = (uint64_t)type32; + nvlist_lookup_nvlist(*source, "props", &nvl); + *source = nvl; + break; + case ZFS_IOC_CLONE: + buf = fnvlist_lookup_string(*source, "origin"); + strlcpy(zc->zc_value, buf, MAXPATHLEN); + nvlist_lookup_nvlist(*source, "props", &nvl); + *ioc = ZFS_IOC_CREATE; + *source = nvl; + break; + case ZFS_IOC_SNAPSHOT: + nvl = fnvlist_lookup_nvlist(*source, "snaps"); + pair = nvlist_next_nvpair(nvl, NULL); + if (pair != NULL) { + buf = nvpair_name(pair); + pos = strcspn(buf, "@"); + strlcpy(zc->zc_name, buf, pos + 1); + strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN); + } else + error = EINVAL; + /* old kernel cannot create multiple snapshots */ + if (!error && nvlist_next_nvpair(nvl, pair) != NULL) + error = EOPNOTSUPP; + nvlist_free(nvl); + nvl = NULL; + nvlist_lookup_nvlist(*source, "props", &nvl); + *source = nvl; + break; + case ZFS_IOC_SPACE_SNAPS: + buf = fnvlist_lookup_string(*source, "firstsnap"); + strlcpy(zc->zc_value, buf, MAXPATHLEN); + break; + case ZFS_IOC_DESTROY_SNAPS: + nvl = fnvlist_lookup_nvlist(*source, "snaps"); + pair = nvlist_next_nvpair(nvl, NULL); + if (pair != NULL) { + buf = nvpair_name(pair); + pos = strcspn(buf, "@"); + strlcpy(zc->zc_name, buf, pos + 1); + } else + error = EINVAL; + /* old kernel cannot atomically destroy multiple snaps */ + if (!error && nvlist_next_nvpair(nvl, pair) != NULL) + error = EOPNOTSUPP; + *source = nvl; + break; + case ZFS_IOC_HOLD: + nvl = fnvlist_lookup_nvlist(*source, "holds"); + pair = nvlist_next_nvpair(nvl, NULL); + if (pair != NULL) { + buf = nvpair_name(pair); + pos = strcspn(buf, "@"); + strlcpy(zc->zc_name, buf, pos + 1); + strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN); + if (nvpair_value_string(pair, &val) == 0) + strlcpy(zc->zc_string, val, MAXNAMELEN); + else + error = EINVAL; + } else + error = EINVAL; + /* old kernel cannot atomically create multiple holds */ + if (!error && nvlist_next_nvpair(nvl, pair) != NULL) + error = EOPNOTSUPP; + nvlist_free(nvl); + if (nvlist_lookup_int32(*source, "cleanup_fd", + &cleanup_fd) == 0) + zc->zc_cleanup_fd = cleanup_fd; + else + zc->zc_cleanup_fd = -1; + break; + case ZFS_IOC_RELEASE: + pair = nvlist_next_nvpair(*source, NULL); + if (pair != NULL) { + buf = nvpair_name(pair); + pos = strcspn(buf, "@"); + strlcpy(zc->zc_name, buf, pos + 1); + strlcpy(zc->zc_value, buf + pos + 1, MAXPATHLEN); + if (nvpair_value_nvlist(pair, &nvl) == 0) { + hpair = nvlist_next_nvpair(nvl, NULL); + if (hpair != NULL) + strlcpy(zc->zc_string, + nvpair_name(hpair), MAXNAMELEN); + else + error = EINVAL; + if (!error && nvlist_next_nvpair(nvl, + hpair) != NULL) + error = EOPNOTSUPP; + } else + error = EINVAL; + } else + error = EINVAL; + /* old kernel cannot atomically release multiple holds */ + if (!error && nvlist_next_nvpair(nvl, pair) != NULL) + error = EOPNOTSUPP; + break; + } + + return (error); +} + +void +lzc_compat_post(zfs_cmd_t *zc, const zfs_ioc_t ioc) +{ + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + return; + + switch (ioc) { + case ZFS_IOC_CREATE: + case 
ZFS_IOC_CLONE: + case ZFS_IOC_SNAPSHOT: + case ZFS_IOC_SPACE_SNAPS: + case ZFS_IOC_DESTROY_SNAPS: + zc->zc_nvlist_dst_filled = B_FALSE; + break; + } +} + +int +lzc_compat_outnvl(zfs_cmd_t *zc, const zfs_ioc_t ioc, nvlist_t **outnvl) +{ + nvlist_t *nvl; + + if (zfs_ioctl_version >= ZFS_IOCVER_LZC) + return (0); + + switch (ioc) { + case ZFS_IOC_SPACE_SNAPS: + nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "used", zc->zc_cookie); + fnvlist_add_uint64(nvl, "compressed", zc->zc_objset_type); + fnvlist_add_uint64(nvl, "uncompressed", zc->zc_perm_action); + *outnvl = nvl; + break; + } + + return (0); +} Index: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h diff -N src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzfs_core/common/libzfs_core_compat.h 23 Mar 2013 15:31:39 -0000 @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Martin Matuska . All rights reserved. + */ + +#ifndef _LIBZFS_CORE_COMPAT_H +#define _LIBZFS_CORE_COMPAT_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int lzc_compat_pre(zfs_cmd_t *, zfs_ioc_t *, nvlist_t **); +void lzc_compat_post(zfs_cmd_t *, const zfs_ioc_t); +int lzc_compat_outnvl(zfs_cmd_t *, const zfs_ioc_t, nvlist_t **); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_CORE_COMPAT_H */ Index: src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c =================================================================== RCS file: src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c diff -N src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/lib/libzpool/common/kernel.c 30 Apr 2017 04:17:20 -0000 @@ -0,0 +1,1210 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Emulation of kernel services in userland. + */ + +#ifndef __FreeBSD__ +int aok; +#endif +uint64_t physmem; +vnode_t *rootdir = (vnode_t *)0xabcd1234; +char hw_serial[HW_HOSTID_LEN]; +#ifdef illumos +kmutex_t cpu_lock; +#endif + +/* If set, all blocks read will be copied to the specified directory. */ +char *vn_dumpdir = NULL; + +struct utsname utsname = { + "userland", "libzpool", "1", "1", "na" +}; + +/* this only exists to have its address taken */ +struct proc p0; + +/* + * ========================================================================= + * threads + * ========================================================================= + */ +/*ARGSUSED*/ +kthread_t * +zk_thread_create(void (*func)(), void *arg) +{ + thread_t tid; + + VERIFY(thr_create(0, 0, (void *(*)(void *))func, arg, THR_DETACHED, + &tid) == 0); + + return ((void *)(uintptr_t)tid); +} + +/* + * ========================================================================= + * kstats + * ========================================================================= + */ +/*ARGSUSED*/ +kstat_t * +kstat_create(char *module, int instance, char *name, char *class, + uchar_t type, ulong_t ndata, uchar_t ks_flag) +{ + return (NULL); +} + +/*ARGSUSED*/ +void +kstat_named_init(kstat_named_t *knp, const char *name, uchar_t type) +{} + +/*ARGSUSED*/ +void +kstat_install(kstat_t *ksp) +{} + +/*ARGSUSED*/ +void +kstat_delete(kstat_t *ksp) +{} + +/* + * ========================================================================= + * mutexes + * ========================================================================= + */ +void +zmutex_init(kmutex_t *mp) +{ + mp->m_owner = NULL; + mp->initialized = B_TRUE; + (void) _mutex_init(&mp->m_lock, USYNC_THREAD, NULL); +} + +void +zmutex_destroy(kmutex_t *mp) +{ + ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_owner == NULL); + (void) _mutex_destroy(&(mp)->m_lock); + mp->m_owner = (void *)-1UL; + mp->initialized = B_FALSE; +} + +int +zmutex_owned(kmutex_t *mp) +{ + ASSERT(mp->initialized == B_TRUE); + + return (mp->m_owner == curthread); +} + +void +mutex_enter(kmutex_t *mp) +{ + ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_owner != (void *)-1UL); + ASSERT(mp->m_owner != curthread); + VERIFY(mutex_lock(&mp->m_lock) == 0); + ASSERT(mp->m_owner == NULL); + mp->m_owner = curthread; +} + +int +mutex_tryenter(kmutex_t *mp) +{ + ASSERT(mp->initialized == B_TRUE); + ASSERT(mp->m_owner != (void *)-1UL); + if (0 == mutex_trylock(&mp->m_lock)) { + ASSERT(mp->m_owner == NULL); + mp->m_owner = curthread; + return (1); + } else { + return (0); + } +} + +void +mutex_exit(kmutex_t *mp) +{ + ASSERT(mp->initialized == B_TRUE); + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; + VERIFY(mutex_unlock(&mp->m_lock) == 0); +} + +void * +mutex_owner(kmutex_t *mp) +{ + ASSERT(mp->initialized == B_TRUE); + return (mp->m_owner); +} + +/* + * 
========================================================================= + * rwlocks + * ========================================================================= + */ +/*ARGSUSED*/ +void +rw_init(krwlock_t *rwlp, char *name, int type, void *arg) +{ + rwlock_init(&rwlp->rw_lock, USYNC_THREAD, NULL); + rwlp->rw_owner = NULL; + rwlp->initialized = B_TRUE; + rwlp->rw_count = 0; +} + +void +rw_destroy(krwlock_t *rwlp) +{ + ASSERT(rwlp->rw_count == 0); + rwlock_destroy(&rwlp->rw_lock); + rwlp->rw_owner = (void *)-1UL; + rwlp->initialized = B_FALSE; +} + +void +rw_enter(krwlock_t *rwlp, krw_t rw) +{ + //ASSERT(!RW_LOCK_HELD(rwlp)); + ASSERT(rwlp->initialized == B_TRUE); + ASSERT(rwlp->rw_owner != (void *)-1UL); + ASSERT(rwlp->rw_owner != curthread); + + if (rw == RW_READER) { + VERIFY(rw_rdlock(&rwlp->rw_lock) == 0); + ASSERT(rwlp->rw_count >= 0); + atomic_add_int(&rwlp->rw_count, 1); + } else { + VERIFY(rw_wrlock(&rwlp->rw_lock) == 0); + ASSERT(rwlp->rw_count == 0); + rwlp->rw_count = -1; + rwlp->rw_owner = curthread; + } +} + +void +rw_exit(krwlock_t *rwlp) +{ + ASSERT(rwlp->initialized == B_TRUE); + ASSERT(rwlp->rw_owner != (void *)-1UL); + + if (rwlp->rw_owner == curthread) { + /* Write locked. */ + ASSERT(rwlp->rw_count == -1); + rwlp->rw_count = 0; + rwlp->rw_owner = NULL; + } else { + /* Read locked. */ + ASSERT(rwlp->rw_count > 0); + atomic_add_int(&rwlp->rw_count, -1); + } + VERIFY(rw_unlock(&rwlp->rw_lock) == 0); +} + +int +rw_tryenter(krwlock_t *rwlp, krw_t rw) +{ + int rv; + + ASSERT(rwlp->initialized == B_TRUE); + ASSERT(rwlp->rw_owner != (void *)-1UL); + ASSERT(rwlp->rw_owner != curthread); + + if (rw == RW_READER) + rv = rw_tryrdlock(&rwlp->rw_lock); + else + rv = rw_trywrlock(&rwlp->rw_lock); + + if (rv == 0) { + ASSERT(rwlp->rw_owner == NULL); + if (rw == RW_READER) { + ASSERT(rwlp->rw_count >= 0); + atomic_add_int(&rwlp->rw_count, 1); + } else { + ASSERT(rwlp->rw_count == 0); + rwlp->rw_count = -1; + rwlp->rw_owner = curthread; + } + return (1); + } + + return (0); +} + +/*ARGSUSED*/ +int +rw_tryupgrade(krwlock_t *rwlp) +{ + ASSERT(rwlp->initialized == B_TRUE); + ASSERT(rwlp->rw_owner != (void *)-1UL); + + return (0); +} + +int +rw_lock_held(krwlock_t *rwlp) +{ + + return (rwlp->rw_count != 0); +} + +/* + * ========================================================================= + * condition variables + * ========================================================================= + */ +/*ARGSUSED*/ +void +cv_init(kcondvar_t *cv, char *name, int type, void *arg) +{ + VERIFY(cond_init(cv, name, NULL) == 0); +} + +void +cv_destroy(kcondvar_t *cv) +{ + VERIFY(cond_destroy(cv) == 0); +} + +void +cv_wait(kcondvar_t *cv, kmutex_t *mp) +{ + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; + int ret = cond_wait(cv, &mp->m_lock); + VERIFY(ret == 0 || ret == EINTR); + mp->m_owner = curthread; +} + +clock_t +cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) +{ + int error; + struct timespec ts; + struct timeval tv; + clock_t delta; + + abstime += ddi_get_lbolt(); +top: + delta = abstime - ddi_get_lbolt(); + if (delta <= 0) + return (-1); + + if (gettimeofday(&tv, NULL) != 0) + assert(!"gettimeofday() failed"); + + ts.tv_sec = tv.tv_sec + delta / hz; + ts.tv_nsec = tv.tv_usec * 1000 + (delta % hz) * (NANOSEC / hz); + ASSERT(ts.tv_nsec >= 0); + + if (ts.tv_nsec >= NANOSEC) { + ts.tv_sec++; + ts.tv_nsec -= NANOSEC; + } + + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; + error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); + mp->m_owner = curthread; + + if 
(error == EINTR) + goto top; + + if (error == ETIMEDOUT) + return (-1); + + ASSERT(error == 0); + + return (1); +} + +/*ARGSUSED*/ +clock_t +cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + int error; + timestruc_t ts; + hrtime_t delta; + + ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); + +top: + delta = tim; + if (flag & CALLOUT_FLAG_ABSOLUTE) + delta -= gethrtime(); + + if (delta <= 0) + return (-1); + + ts.tv_sec = delta / NANOSEC; + ts.tv_nsec = delta % NANOSEC; + + ASSERT(mutex_owner(mp) == curthread); + mp->m_owner = NULL; + error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); + mp->m_owner = curthread; + + if (error == ETIMEDOUT) + return (-1); + + if (error == EINTR) + goto top; + + ASSERT(error == 0); + + return (1); +} + +void +cv_signal(kcondvar_t *cv) +{ + VERIFY(cond_signal(cv) == 0); +} + +void +cv_broadcast(kcondvar_t *cv) +{ + VERIFY(cond_broadcast(cv) == 0); +} + +/* + * ========================================================================= + * vnode operations + * ========================================================================= + */ +/* + * Note: for the xxxat() versions of these functions, we assume that the + * starting vp is always rootdir (which is true for spa_directory.c, the only + * ZFS consumer of these interfaces). We assert this is true, and then emulate + * them by adding '/' in front of the path. + */ + +/*ARGSUSED*/ +int +vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) +{ + int fd; + int dump_fd; + vnode_t *vp; + int old_umask; + char realpath[MAXPATHLEN]; + struct stat64 st; + + /* + * If we're accessing a real disk from userland, we need to use + * the character interface to avoid caching. This is particularly + * important if we're trying to look at a real in-kernel storage + * pool from userland, e.g. via zdb, because otherwise we won't + * see the changes occurring under the segmap cache. + * On the other hand, the stupid character device returns zero + * for its size. So -- gag -- we open the block device to get + * its size, and remember it for subsequent VOP_GETATTR(). + */ + if (strncmp(path, "/dev/", 5) == 0) { + char *dsk; + fd = open64(path, O_RDONLY); + if (fd == -1) + return (errno); + if (fstat64(fd, &st) == -1) { + close(fd); + return (errno); + } + close(fd); + (void) sprintf(realpath, "%s", path); + dsk = strstr(path, "/dsk/"); + if (dsk != NULL) + (void) sprintf(realpath + (dsk - path) + 1, "r%s", + dsk + 1); + } else { + (void) sprintf(realpath, "%s", path); + if (!(flags & FCREAT) && stat64(realpath, &st) == -1) + return (errno); + } + + if (flags & FCREAT) + old_umask = umask(0); + + /* + * The construct 'flags - FREAD' conveniently maps combinations of + * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. 
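For illustration only: the 'flags - FREAD' comment above relies on the conventional Solaris-derived values FREAD == 0x01 and FWRITE == 0x02, so subtracting FREAD turns the kernel-style flag combinations into the open(2) access modes. A sketch that makes the arithmetic explicit (which header provides FREAD/FWRITE varies by platform; this assumes they are visible via <fcntl.h>):

#include <fcntl.h>
#include <assert.h>

/* Sketch: verify the mapping behind 'flags - FREAD'. */
static void
check_flag_mapping(void)
{
	assert(FREAD - FREAD == O_RDONLY);		/* 1 - 1 == 0 */
	assert(FWRITE - FREAD == O_WRONLY);		/* 2 - 1 == 1 */
	assert((FREAD | FWRITE) - FREAD == O_RDWR);	/* 3 - 1 == 2 */
}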
+ */ + fd = open64(realpath, flags - FREAD, mode); + + if (flags & FCREAT) + (void) umask(old_umask); + + if (vn_dumpdir != NULL) { + char dumppath[MAXPATHLEN]; + (void) snprintf(dumppath, sizeof (dumppath), + "%s/%s", vn_dumpdir, basename(realpath)); + dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); + if (dump_fd == -1) + return (errno); + } else { + dump_fd = -1; + } + + if (fd == -1) + return (errno); + + if (fstat64(fd, &st) == -1) { + close(fd); + return (errno); + } + + (void) fcntl(fd, F_SETFD, FD_CLOEXEC); + + *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); + + vp->v_fd = fd; + vp->v_size = st.st_size; + vp->v_path = spa_strdup(path); + vp->v_dump_fd = dump_fd; + + return (0); +} + +/*ARGSUSED*/ +int +vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, + int x3, vnode_t *startvp, int fd) +{ + char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); + int ret; + + ASSERT(startvp == rootdir); + (void) sprintf(realpath, "/%s", path); + + /* fd ignored for now, need if want to simulate nbmand support */ + ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); + + umem_free(realpath, strlen(path) + 2); + + return (ret); +} + +/*ARGSUSED*/ +int +vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, + int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) +{ + ssize_t iolen, split; + + if (uio == UIO_READ) { + iolen = pread64(vp->v_fd, addr, len, offset); + if (vp->v_dump_fd != -1) { + int status = + pwrite64(vp->v_dump_fd, addr, iolen, offset); + ASSERT(status != -1); + } + } else { + /* + * To simulate partial disk writes, we split writes into two + * system calls so that the process can be killed in between. + */ + int sectors = len >> SPA_MINBLOCKSHIFT; + split = (sectors > 0 ? rand() % sectors : 0) << + SPA_MINBLOCKSHIFT; + iolen = pwrite64(vp->v_fd, addr, split, offset); + iolen += pwrite64(vp->v_fd, (char *)addr + split, + len - split, offset + split); + } + + if (iolen == -1) + return (errno); + if (residp) + *residp = len - iolen; + else if (iolen != len) + return (EIO); + return (0); +} + +void +vn_close(vnode_t *vp, int openflag, cred_t *cr, kthread_t *td) +{ + close(vp->v_fd); + if (vp->v_dump_fd != -1) + close(vp->v_dump_fd); + spa_strfree(vp->v_path); + umem_free(vp, sizeof (vnode_t)); +} + +/* + * At a minimum we need to update the size since vdev_reopen() + * will no longer call vn_openat(). + */ +int +fop_getattr(vnode_t *vp, vattr_t *vap) +{ + struct stat64 st; + + if (fstat64(vp->v_fd, &st) == -1) { + close(vp->v_fd); + return (errno); + } + + vap->va_size = st.st_size; + return (0); +} + +#ifdef ZFS_DEBUG + +/* + * ========================================================================= + * Figure out which debugging statements to print + * ========================================================================= + */ + +static char *dprintf_string; +static int dprintf_print_all; + +int +dprintf_find_string(const char *string) +{ + char *tmp_str = dprintf_string; + int len = strlen(string); + + /* + * Find out if this is a string we want to print. 
+ * String format: file1.c,function_name1,file2.c,file3.c + */ + + while (tmp_str != NULL) { + if (strncmp(tmp_str, string, len) == 0 && + (tmp_str[len] == ',' || tmp_str[len] == '\0')) + return (1); + tmp_str = strchr(tmp_str, ','); + if (tmp_str != NULL) + tmp_str++; /* Get rid of , */ + } + return (0); +} + +void +dprintf_setup(int *argc, char **argv) +{ + int i, j; + + /* + * Debugging can be specified two ways: by setting the + * environment variable ZFS_DEBUG, or by including a + * "debug=..." argument on the command line. The command + * line setting overrides the environment variable. + */ + + for (i = 1; i < *argc; i++) { + int len = strlen("debug="); + /* First look for a command line argument */ + if (strncmp("debug=", argv[i], len) == 0) { + dprintf_string = argv[i] + len; + /* Remove from args */ + for (j = i; j < *argc; j++) + argv[j] = argv[j+1]; + argv[j] = NULL; + (*argc)--; + } + } + + if (dprintf_string == NULL) { + /* Look for ZFS_DEBUG environment variable */ + dprintf_string = getenv("ZFS_DEBUG"); + } + + /* + * Are we just turning on all debugging? + */ + if (dprintf_find_string("on")) + dprintf_print_all = 1; + + if (dprintf_string != NULL) + zfs_flags |= ZFS_DEBUG_DPRINTF; +} + +int +sysctl_handle_64(SYSCTL_HANDLER_ARGS) +{ + return (0); +} + +/* + * ========================================================================= + * debug printfs + * ========================================================================= + */ +void +__dprintf(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + va_list adx; + + /* + * Get rid of annoying "../common/" prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + if (dprintf_print_all || + dprintf_find_string(newfile) || + dprintf_find_string(func)) { + /* Print out just the function name if requested */ + flockfile(stdout); + if (dprintf_find_string("pid")) + (void) printf("%d ", getpid()); + if (dprintf_find_string("tid")) + (void) printf("%lu ", thr_self()); +#if 0 + if (dprintf_find_string("cpu")) + (void) printf("%u ", getcpuid()); +#endif + if (dprintf_find_string("time")) + (void) printf("%llu ", gethrtime()); + if (dprintf_find_string("long")) + (void) printf("%s, line %d: ", newfile, line); + (void) printf("%s: ", func); + va_start(adx, fmt); + (void) vprintf(fmt, adx); + va_end(adx); + funlockfile(stdout); + } +} + +#endif /* ZFS_DEBUG */ + +/* + * ========================================================================= + * cmn_err() and panic() + * ========================================================================= + */ +static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; +static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; + +void +vpanic(const char *fmt, va_list adx) +{ + (void) fprintf(stderr, "error: "); + (void) vfprintf(stderr, fmt, adx); + (void) fprintf(stderr, "\n"); + + abort(); /* think of it as a "user-level crash dump" */ +} + +void +panic(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vpanic(fmt, adx); + va_end(adx); +} + +void +vcmn_err(int ce, const char *fmt, va_list adx) +{ + if (ce == CE_PANIC) + vpanic(fmt, adx); + if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ + (void) fprintf(stderr, "%s", ce_prefix[ce]); + (void) vfprintf(stderr, fmt, adx); + (void) fprintf(stderr, "%s", ce_suffix[ce]); + } +} + +/*PRINTFLIKE2*/ +void +cmn_err(int ce, const char *fmt, ...) 
+{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(ce, fmt, adx); + va_end(adx); +} + +/* + * ========================================================================= + * kobj interfaces + * ========================================================================= + */ +struct _buf * +kobj_open_file(char *name) +{ + struct _buf *file; + vnode_t *vp; + + /* set vp as the _fd field of the file */ + if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, + -1) != 0) + return ((void *)-1UL); + + file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); + file->_fd = (intptr_t)vp; + return (file); +} + +int +kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) +{ + ssize_t resid; + + vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, + UIO_SYSSPACE, 0, 0, 0, &resid); + + return (size - resid); +} + +void +kobj_close_file(struct _buf *file) +{ + vn_close((vnode_t *)file->_fd, 0, NULL, NULL); + umem_free(file, sizeof (struct _buf)); +} + +int +kobj_get_filesize(struct _buf *file, uint64_t *size) +{ + struct stat64 st; + vnode_t *vp = (vnode_t *)file->_fd; + + if (fstat64(vp->v_fd, &st) == -1) { + vn_close(vp, 0, NULL, NULL); + return (errno); + } + *size = st.st_size; + return (0); +} + +/* + * ========================================================================= + * misc routines + * ========================================================================= + */ + +void +delay(clock_t ticks) +{ + poll(0, 0, ticks * (1000 / hz)); +} + +#if 0 +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + */ +int +highbit64(uint64_t i) +{ + int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ULL) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +} +#endif + +static int random_fd = -1, urandom_fd = -1; + +static int +random_get_bytes_common(uint8_t *ptr, size_t len, int fd) +{ + size_t resid = len; + ssize_t bytes; + + ASSERT(fd != -1); + + while (resid != 0) { + bytes = read(fd, ptr, resid); + ASSERT3S(bytes, >=, 0); + ptr += bytes; + resid -= bytes; + } + + return (0); +} + +int +random_get_bytes(uint8_t *ptr, size_t len) +{ + return (random_get_bytes_common(ptr, len, random_fd)); +} + +int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + return (random_get_bytes_common(ptr, len, urandom_fd)); +} + +int +ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) +{ + char *end; + + *result = strtoul(hw_serial, &end, base); + if (*result == 0) + return (errno); + return (0); +} + +int +ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) +{ + char *end; + + *result = strtoull(str, &end, base); + if (*result == 0) + return (errno); + return (0); +} + +#ifndef __FreeBSD__ +/* ARGSUSED */ +cyclic_id_t +cyclic_add(cyc_handler_t *hdlr, cyc_time_t *when) +{ + return (1); +} + +/* ARGSUSED */ +void +cyclic_remove(cyclic_id_t id) +{ +} + +/* ARGSUSED */ +int +cyclic_reprogram(cyclic_id_t id, hrtime_t expiration) +{ + return (1); +} +#endif + +/* + * ========================================================================= + * kernel emulation setup & teardown + * ========================================================================= + */ +static int +umem_out_of_memory(void) +{ + char errmsg[] = "out of memory -- generating core dump\n"; + + 
write(fileno(stderr), errmsg, sizeof (errmsg)); + abort(); + return (0); +} + +void +kernel_init(int mode) +{ + extern uint_t rrw_tsd_key; + + umem_nofail_callback(umem_out_of_memory); + + physmem = sysconf(_SC_PHYS_PAGES); + + dprintf("physmem = %llu pages (%.2f GB)\n", physmem, + (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); + + (void) snprintf(hw_serial, sizeof (hw_serial), "%lu", + (mode & FWRITE) ? (unsigned long)gethostid() : 0); + + VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); + VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); + + system_taskq_init(); + +#ifdef illumos + mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL); +#endif + + spa_init(mode); + + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); +} + +void +kernel_fini(void) +{ + spa_fini(); + + system_taskq_fini(); + + close(random_fd); + close(urandom_fd); + + random_fd = -1; + urandom_fd = -1; +} + +int +z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) +{ + int ret; + uLongf len = *dstlen; + + if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK) + *dstlen = (size_t)len; + + return (ret); +} + +int +z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, + int level) +{ + int ret; + uLongf len = *dstlen; + + if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK) + *dstlen = (size_t)len; + + return (ret); +} + +uid_t +crgetuid(cred_t *cr) +{ + return (0); +} + +uid_t +crgetruid(cred_t *cr) +{ + return (0); +} + +gid_t +crgetgid(cred_t *cr) +{ + return (0); +} + +int +crgetngroups(cred_t *cr) +{ + return (0); +} + +gid_t * +crgetgroups(cred_t *cr) +{ + return (NULL); +} + +int +zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) +{ + return (0); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + return (0); +} + +ksiddomain_t * +ksid_lookupdomain(const char *dom) +{ + ksiddomain_t *kd; + + kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); + kd->kd_name = spa_strdup(dom); + return (kd); +} + +void +ksiddomain_rele(ksiddomain_t *ksid) +{ + spa_strfree(ksid->kd_name); + umem_free(ksid, sizeof (ksiddomain_t)); +} + +/* + * Do not change the length of the returned string; it must be freed + * with strfree(). + */ +char * +kmem_asprintf(const char *fmt, ...) 
+{ + int size; + va_list adx; + char *buf; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx) + 1; + va_end(adx); + + buf = kmem_alloc(size, KM_SLEEP); + + va_start(adx, fmt); + size = vsnprintf(buf, size, fmt, adx); + va_end(adx); + + return (buf); +} + +/* ARGSUSED */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + *minorp = 0; + return (0); +} + +/* ARGSUSED */ +void +zfs_onexit_fd_rele(int fd) +{ +} + +/* ARGSUSED */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) +{ + return (0); +} + +#ifdef __FreeBSD__ +/* ARGSUSED */ +int +zvol_create_minors(const char *name) +{ + return (0); +} +#endif + +#ifdef illumos +void +bioinit(buf_t *bp) +{ + bzero(bp, sizeof (buf_t)); +} + +void +biodone(buf_t *bp) +{ + if (bp->b_iodone != NULL) { + (*(bp->b_iodone))(bp); + return; + } + ASSERT((bp->b_flags & B_DONE) == 0); + bp->b_flags |= B_DONE; +} + +void +bioerror(buf_t *bp, int error) +{ + ASSERT(bp != NULL); + ASSERT(error >= 0); + + if (error != 0) { + bp->b_flags |= B_ERROR; + } else { + bp->b_flags &= ~B_ERROR; + } + bp->b_error = error; +} + + +int +geterror(struct buf *bp) +{ + int error = 0; + + if (bp->b_flags & B_ERROR) { + error = bp->b_error; + if (!error) + error = EIO; + } + return (error); +} +#endif Index: src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c,v retrieving revision 1.4 diff -u -p -r1.4 taskq.c --- src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c 21 Jun 2013 16:22:45 -0000 1.4 +++ src/external/cddl/osnet/dist/lib/libzpool/common/taskq.c 19 Nov 2014 12:24:40 -0000 @@ -19,25 +19,25 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 Garrett D'Amore . All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. 
+ */ #include int taskq_now; taskq_t *system_taskq; -typedef struct task { - struct task *task_next; - struct task *task_prev; - task_func_t *task_func; - void *task_arg; -} task_t; - #define TASKQ_ACTIVE 0x00010000 +#define TASKQ_NAMELEN 31 struct taskq { + char tq_name[TASKQ_NAMELEN + 1]; kmutex_t tq_lock; krwlock_t tq_threadlock; kcondvar_t tq_dispatch_cv; @@ -49,37 +49,46 @@ struct taskq { int tq_nalloc; int tq_minalloc; int tq_maxalloc; - task_t *tq_freelist; - task_t tq_task; + kcondvar_t tq_maxalloc_cv; + int tq_maxalloc_wait; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; }; -static task_t * +static taskq_ent_t * task_alloc(taskq_t *tq, int tqflags) { - task_t *t; + taskq_ent_t *t; + int rv; - if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { - tq->tq_freelist = t->task_next; +again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { + tq->tq_freelist = t->tqent_next; } else { - mutex_exit(&tq->tq_lock); if (tq->tq_nalloc >= tq->tq_maxalloc) { - if (!(tqflags & KM_SLEEP)) { - mutex_enter(&tq->tq_lock); + if (!(tqflags & KM_SLEEP)) return (NULL); - } + /* * We don't want to exceed tq_maxalloc, but we can't * wait for other tasks to complete (and thus free up * task structures) without risking deadlock with * the caller. So, we just delay for one second - * to throttle the allocation rate. + * to throttle the allocation rate. If we have tasks + * complete before one second timeout expires then + * taskq_ent_free will signal us and we will + * immediately retry the allocation. */ - xdelay(hz); + tq->tq_maxalloc_wait++; + rv = cv_timedwait(&tq->tq_maxalloc_cv, + &tq->tq_lock, ddi_get_lbolt() + hz); + tq->tq_maxalloc_wait--; + if (rv > 0) + goto again; /* signaled */ } + mutex_exit(&tq->tq_lock); + + t = kmem_alloc(sizeof (taskq_ent_t), tqflags & KM_SLEEP); - /* Clean up TQ_FRONT from tqflags before passing it to kmem */ - t = kmem_alloc(sizeof (task_t), - tqflags & (KM_SLEEP | KM_NOSLEEP)); mutex_enter(&tq->tq_lock); if (t != NULL) tq->tq_nalloc++; @@ -88,23 +97,26 @@ task_alloc(taskq_t *tq, int tqflags) } static void -task_free(taskq_t *tq, task_t *t) +task_free(taskq_t *tq, taskq_ent_t *t) { if (tq->tq_nalloc <= tq->tq_minalloc) { - t->task_next = tq->tq_freelist; + t->tqent_next = tq->tq_freelist; tq->tq_freelist = t; } else { tq->tq_nalloc--; mutex_exit(&tq->tq_lock); - kmem_free(t, sizeof (task_t)); + kmem_free(t, sizeof (taskq_ent_t)); mutex_enter(&tq->tq_lock); } + + if (tq->tq_maxalloc_wait) + cv_signal(&tq->tq_maxalloc_cv); } taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) { - task_t *t; + taskq_ent_t *t; if (taskq_now) { func(arg); @@ -118,26 +130,59 @@ taskq_dispatch(taskq_t *tq, task_func_t return (0); } if (tqflags & TQ_FRONT) { - t->task_next = tq->tq_task.task_next; - t->task_prev = &tq->tq_task; + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; } else { - t->task_next = &tq->tq_task; - t->task_prev = tq->tq_task.task_prev; + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; } - t->task_next->task_prev = t; - t->task_prev->task_next = t; - t->task_func = func; - t->task_arg = arg; + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_flags = 0; cv_signal(&tq->tq_dispatch_cv); mutex_exit(&tq->tq_lock); return (1); } void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + ASSERT(func != NULL); + ASSERT(!(tq->tq_flags & 
TASKQ_DYNAMIC)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + t->tqent_flags |= TQENT_FLAG_PREALLOC; + /* + * Enqueue the task to the underlying queue. + */ + mutex_enter(&tq->tq_lock); + + if (flags & TQ_FRONT) { + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; + } else { + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; + } + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + cv_signal(&tq->tq_dispatch_cv); + mutex_exit(&tq->tq_lock); +} + +void taskq_wait(taskq_t *tq) { mutex_enter(&tq->tq_lock); - while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0) + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) cv_wait(&tq->tq_wait_cv, &tq->tq_lock); mutex_exit(&tq->tq_lock); } @@ -146,27 +191,32 @@ static void * taskq_thread(void *arg) { taskq_t *tq = arg; - task_t *t; + taskq_ent_t *t; + boolean_t prealloc; mutex_enter(&tq->tq_lock); while (tq->tq_flags & TASKQ_ACTIVE) { - if ((t = tq->tq_task.task_next) == &tq->tq_task) { + if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { if (--tq->tq_active == 0) cv_broadcast(&tq->tq_wait_cv); cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); tq->tq_active++; continue; } - t->task_prev->task_next = t->task_next; - t->task_next->task_prev = t->task_prev; + t->tqent_prev->tqent_next = t->tqent_next; + t->tqent_next->tqent_prev = t->tqent_prev; + t->tqent_next = NULL; + t->tqent_prev = NULL; + prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC; mutex_exit(&tq->tq_lock); rw_enter(&tq->tq_threadlock, RW_READER); - t->task_func(t->task_arg); + t->tqent_func(t->tqent_arg); rw_exit(&tq->tq_threadlock); mutex_enter(&tq->tq_lock); - task_free(tq, t); + if (!prealloc) + task_free(tq, t); } tq->tq_nthreads--; cv_broadcast(&tq->tq_wait_cv); @@ -199,13 +249,15 @@ taskq_create(const char *name, int nthre mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); + (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN + 1); tq->tq_flags = flags | TASKQ_ACTIVE; tq->tq_active = nthreads; tq->tq_nthreads = nthreads; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; - tq->tq_task.task_next = &tq->tq_task; - tq->tq_task.task_prev = &tq->tq_task; + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; tq->tq_threadlist = kmem_alloc(nthreads * sizeof (thread_t), KM_SLEEP); if (flags & TASKQ_PREPOPULATE) { @@ -255,6 +307,7 @@ taskq_destroy(taskq_t *tq) mutex_destroy(&tq->tq_lock); cv_destroy(&tq->tq_dispatch_cv); cv_destroy(&tq->tq_wait_cv); + cv_destroy(&tq->tq_maxalloc_cv); kmem_free(tq, sizeof (taskq_t)); } Index: src/external/cddl/osnet/dist/lib/libzpool/common/util.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/lib/libzpool/common/util.c,v retrieving revision 1.3 diff -u -p -r1.3 util.c --- src/external/cddl/osnet/dist/lib/libzpool/common/util.c 28 Mar 2014 02:50:18 -0000 1.3 +++ src/external/cddl/osnet/dist/lib/libzpool/common/util.c 10 Oct 2016 11:14:31 -0000 @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ #include @@ -38,7 +37,7 @@ */ void -nicenum(uint64_t num, char *buf, size_t buflen) +nicenum(uint64_t num, char *buf) { uint64_t n = num; int index = 0; @@ -52,15 +51,15 @@ nicenum(uint64_t num, char *buf, size_t u = " KMGTPE"[index]; if (index == 0) { - (void) snprintf(buf, buflen, "%llu", (u_longlong_t)n); + (void) sprintf(buf, "%llu", (u_longlong_t)n); } else if (n < 10 && (num & (num - 1)) != 0) { - (void) snprintf(buf, buflen, "%.2f%c", + (void) sprintf(buf, "%.2f%c", (double)num / (1ULL << 10 * index), u); } else if (n < 100 && (num & (num - 1)) != 0) { - (void) snprintf(buf, buflen, "%.1f%c", + (void) sprintf(buf, "%.1f%c", (double)num / (1ULL << 10 * index), u); } else { - (void) snprintf(buf, buflen, "%llu%c", (u_longlong_t)n, u); + (void) sprintf(buf, "%llu%c", (u_longlong_t)n, u); } } @@ -90,26 +89,26 @@ show_vdev_stats(const char *desc, const if (is_log) prefix = "log "; - if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0) vs = &v0; sec = MAX(1, vs->vs_timestamp / NANOSEC); - nicenum(vs->vs_alloc, used, sizeof(used)); - nicenum(vs->vs_space - vs->vs_alloc, avail, sizeof(avail)); - nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops, sizeof(rops)); - nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops, sizeof(wops)); - nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes, sizeof(rbytes)); - nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes, sizeof(wbytes)); - nicenum(vs->vs_read_errors, rerr, sizeof(rerr)); - nicenum(vs->vs_write_errors, werr, sizeof(werr)); - nicenum(vs->vs_checksum_errors, cerr, sizeof(cerr)); + nicenum(vs->vs_alloc, used); + nicenum(vs->vs_space - vs->vs_alloc, avail); + nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops); + nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops); + nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes); + nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes); + nicenum(vs->vs_read_errors, rerr); + nicenum(vs->vs_write_errors, werr); + nicenum(vs->vs_checksum_errors, cerr); (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n", indent, "", prefix, - indent + strlen(prefix) - 25 - (vs->vs_space ? 0 : 12), + (int)(indent + strlen(prefix) - 25 - (vs->vs_space ? 0 : 12)), desc, vs->vs_space ? 6 : 0, vs->vs_space ? used : "", vs->vs_space ? 6 : 0, vs->vs_space ? 
avail : "", Index: src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c,v retrieving revision 1.3 diff -u -p -r1.3 barrier.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c 13 Feb 2016 21:37:12 -0000 1.3 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.c 4 Feb 2015 07:20:42 -0000 @@ -38,13 +38,8 @@ */ #include -#if defined(sun) +#ifdef illumos #include -#else -#define USYNC_THREAD 1 -#define sema_init(a, b, c, d) sem_init((a), (c) != USYNC_THREAD, (b)) -#define sema_wait(a) sem_wait(a) -#define sema_post(a) sem_post(a) #endif #include @@ -54,7 +49,12 @@ void barrier_init(barrier_t *bar, int nthreads) { pthread_mutex_init(&bar->bar_lock, NULL); +#ifdef illumos sema_init(&bar->bar_sem, 0, USYNC_THREAD, NULL); +#else + sem_init(&bar->bar_sem, 0, 0); +#endif + bar->bar_numin = 0; bar->bar_nthr = nthreads; } @@ -66,7 +66,12 @@ barrier_wait(barrier_t *bar) if (++bar->bar_numin < bar->bar_nthr) { pthread_mutex_unlock(&bar->bar_lock); +#ifdef illumos sema_wait(&bar->bar_sem); +#else + sem_wait(&bar->bar_sem); +#endif + return (0); } else { @@ -75,7 +80,11 @@ barrier_wait(barrier_t *bar) /* reset for next use */ bar->bar_numin = 0; for (i = 1; i < bar->bar_nthr; i++) +#ifdef illumos sema_post(&bar->bar_sem); +#else + sem_post(&bar->bar_sem); +#endif pthread_mutex_unlock(&bar->bar_lock); return (1); Index: src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h,v retrieving revision 1.2 diff -u -p -r1.2 barrier.h --- src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h 21 Feb 2010 00:49:55 -0000 1.2 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/barrier.h 4 Feb 2015 07:20:42 -0000 @@ -33,7 +33,7 @@ * APIs for the barrier synchronization primitive. */ -#if defined(sun) +#ifdef illumos #include #else #include Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c,v retrieving revision 1.13 diff -u -p -r1.13 ctf.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c 18 Mar 2016 17:11:04 -0000 1.13 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c 13 Apr 2017 19:16:42 -0000 @@ -56,8 +56,6 @@ static char *curfile; #define CTF_BUF_CHUNK_SIZE (64 * 1024) #define RES_BUF_CHUNK_SIZE (64 * 1024) -static int ntypes = 0; /* The number of types. 
*/ - struct ctf_buf { strtab_t ctb_strtab; /* string table */ caddr_t ctb_base; /* pointer to base of buffer */ @@ -1157,10 +1155,6 @@ resurrect_types(ctf_header_t *h, tdata_t (*mpp)->ml_type = tdarr[ctm->ctm_type]; (*mpp)->ml_offset = ctm->ctm_offset; (*mpp)->ml_size = 0; - if (ctm->ctm_type > ntypes) { - parseterminate("Invalid member type ctm_type=%d", - ctm->ctm_type); - } } } else { for (i = 0, mpp = &tdp->t_members; i < vlen; @@ -1177,10 +1171,6 @@ resurrect_types(ctf_header_t *h, tdata_t (*mpp)->ml_offset = (int)CTF_LMEM_OFFSET(ctlm); (*mpp)->ml_size = 0; - if (ctlm->ctlm_type > ntypes) { - parseterminate("Invalid lmember type ctlm_type=%d", - ctlm->ctlm_type); - } } } @@ -1294,10 +1284,9 @@ ctf_parse(ctf_header_t *h, caddr_t buf, { tdata_t *td = tdata_new(); tdesc_t **tdarr; + int ntypes = count_types(h, buf); int idx, i; - ntypes = count_types(h, buf); - /* shudder */ tdarr = xcalloc(sizeof (tdesc_t *) * (ntypes + 1)); tdarr[0] = NULL; Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c,v retrieving revision 1.5 diff -u -p -r1.5 ctfconvert.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c 20 Feb 2016 22:08:44 -0000 1.5 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctfconvert.c 13 Apr 2017 19:21:22 -0000 @@ -68,7 +68,7 @@ usage(void) static void terminate_cleanup(void) { -#if 0 +#if !defined(__FreeBSD__) && !defined(__NetBSD__) if (!outfile) { fprintf(stderr, "Removing %s\n", infile); unlink(infile); @@ -159,7 +159,7 @@ main(int argc, char **argv) int keep_stabs = 0; int c; -#if defined(sun) +#ifdef illumos sighold(SIGINT); sighold(SIGQUIT); sighold(SIGTERM); @@ -225,7 +225,7 @@ main(int argc, char **argv) */ set_terminate_cleanup(terminate_cleanup); -#if defined(sun) +#ifdef illumos sigset(SIGINT, handle_sig); sigset(SIGQUIT, handle_sig); sigset(SIGTERM, handle_sig); Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c,v retrieving revision 1.15 diff -u -p -r1.15 ctfmerge.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c 20 Feb 2016 22:08:44 -0000 1.15 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctfmerge.c 13 Apr 2017 19:25:33 -0000 @@ -186,20 +186,20 @@ #endif #include #include -#if defined(sun) +#ifdef illumos #include #endif #include #include #include #include -#if defined(sun) +#ifdef illumos #include #endif #include #include #include -#if defined(sun) +#ifdef illumos #include #endif @@ -222,10 +222,9 @@ static char *outfile = NULL; static char *tmpname = NULL; static int dynsym; int debug_level = DEBUG_LEVEL; -#if 0 +#ifdef illumos static size_t maxpgsize = 0x400000; #endif -static int maxslots = MERGE_PHASE1_MAX_SLOTS; static void @@ -245,7 +244,7 @@ usage(void) progname, progname); } -#if defined(sun) +#ifdef illumos static void bigheap(void) { @@ -624,7 +623,7 @@ terminate_cleanup(void) if (outfile == NULL) return; -#if 0 +#if !defined (__FreeBSD__) && !defined(__NetBSD__) if (dounlink) { fprintf(stderr, "Removing %s\n", outfile); unlink(outfile); @@ -658,7 +657,7 @@ wq_init(workqueue_t *wq, int nfiles) if (getenv("CTFMERGE_MAX_SLOTS")) nslots = atoi(getenv("CTFMERGE_MAX_SLOTS")); else - nslots = maxslots; + nslots = MERGE_PHASE1_MAX_SLOTS; if (getenv("CTFMERGE_PHASE1_BATCH_SIZE")) wq->wq_maxbatchsz = 
atoi(getenv("CTFMERGE_PHASE1_BATCH_SIZE")); @@ -733,7 +732,7 @@ start_threads(workqueue_t *wq) (void *(*)(void *))worker_thread, wq); } -#if defined(sun) +#ifdef illumos sigset(SIGINT, handle_sig); sigset(SIGQUIT, handle_sig); sigset(SIGTERM, handle_sig); Index: src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h,v retrieving revision 1.7 diff -u -p -r1.7 ctftools.h --- src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h 18 Mar 2016 17:07:23 -0000 1.7 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/ctftools.h 13 Apr 2017 19:26:29 -0000 @@ -26,8 +26,6 @@ #ifndef _CTFTOOLS_H #define _CTFTOOLS_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Functions and data structures used in the manipulation of stabs and CTF data */ @@ -39,6 +37,8 @@ #include #include +#include + #ifdef __cplusplus extern "C" { #endif Index: src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c,v retrieving revision 1.23 diff -u -p -r1.23 dwarf.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c 8 Jun 2016 21:32:27 -0000 1.23 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/dwarf.c 22 Apr 2017 17:06:56 -0000 @@ -429,7 +429,7 @@ die_string(dwarf_t *dw, Dwarf_Die die, D static Dwarf_Off die_attr_ref(dwarf_t *dw, Dwarf_Die die, Dwarf_Half name) { - Dwarf_Unsigned off; + Dwarf_Off off; if (dwarf_attrval_unsigned(die, name, &off, &dw->dw_err) != DW_DLV_OK) { terminate("die %ju: failed to get ref: %s\n", @@ -672,8 +672,6 @@ tdesc_array_create(dwarf_t *dw, Dwarf_Di if ((dim2 = die_sibling(dw, dim)) == NULL) { ctdp = arrtdp; - debug(3, "die %ju: sibling type %#x for dimension\n", - (uintmax_t)die_off(dw, dim), ctdp->t_id); } else if (die_tag(dw, dim2) == DW_TAG_subrange_type) { ctdp = xcalloc(sizeof (tdesc_t)); ctdp->t_id = mfgtid_next(dw); @@ -762,13 +760,6 @@ die_array_create(dwarf_t *dw, Dwarf_Die tdesc_t *dimtdp; int flags; - /* Check for bogus gcc DW_AT_byte_size attribute */ - if (uval == (unsigned)-1) { - printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n", - __func__); - uval = 0; - } - tdp->t_size = uval; /* @@ -852,19 +843,16 @@ die_enum_create(dwarf_t *dw, Dwarf_Die d Dwarf_Unsigned uval; Dwarf_Signed sval; + if (die_isdecl(dw, die)) { + tdp->t_type = FORWARD; + return; + } + debug(3, "die %ju: creating enum\n", (uintmax_t)off); - tdp->t_type = (die_isdecl(dw, die) ? 
FORWARD : ENUM); - if (tdp->t_type != ENUM) - return; + tdp->t_type = ENUM; (void) die_unsigned(dw, die, DW_AT_byte_size, &uval, DW_ATTR_REQ); - /* Check for bogus gcc DW_AT_byte_size attribute */ - if (uval == (unsigned)-1) { - printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n", - __func__); - uval = 0; - } tdp->t_size = uval; if ((mem = die_child(dw, die)) != NULL) { @@ -980,7 +968,7 @@ static void die_sou_create(dwarf_t *dw, Dwarf_Die str, Dwarf_Off off, tdesc_t *tdp, int type, const char *typename) { - Dwarf_Unsigned sz, bitsz, bitoff, maxsz=0; + Dwarf_Unsigned sz, bitsz, bitoff; #if BYTE_ORDER == LITTLE_ENDIAN Dwarf_Unsigned bysz; #endif @@ -1040,8 +1028,6 @@ die_sou_create(dwarf_t *dw, Dwarf_Die st ml->ml_name = NULL; ml->ml_type = die_lookup_pass1(dw, mem, DW_AT_type); - debug(3, "die_sou_create(): ml_type = %p t_id = %#x\n", - ml->ml_type, ml->ml_type->t_id); if (die_mem_offset(dw, mem, DW_AT_data_member_location, &mloff, 0)) { @@ -1088,24 +1074,8 @@ die_sou_create(dwarf_t *dw, Dwarf_Die st *mlastp = ml; mlastp = &ml->ml_next; - - /* Find the size of the largest member to work around a gcc - * bug. See GCC Bugzilla 35998. - */ - if (maxsz < ml->ml_size) - maxsz = ml->ml_size; - } while ((mem = die_sibling(dw, mem)) != NULL); - /* See if we got a bogus DW_AT_byte_size. GCC will sometimes - * emit this. - */ - if (sz == (unsigned)-1) { - printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n", - __func__); - tdp->t_size = maxsz / 8; /* maxsz is in bits, t_size is bytes */ - } - /* * GCC will attempt to eliminate unused types, thus decreasing the * size of the emitted dwarf. That is, if you declare a foo_t in your @@ -1246,7 +1216,7 @@ die_sou_resolve(tdesc_t *tdp, tdesc_t ** } if (ml->ml_size != 0 && mt->t_type == INTRINSIC && - mt->t_intr->intr_nbits != (int)ml->ml_size) { + mt->t_intr->intr_nbits != ml->ml_size) { /* * This member is a bitfield, and needs to reference * an intrinsic type with the same width. If the @@ -1564,13 +1534,6 @@ die_base_create(dwarf_t *dw, Dwarf_Die b */ (void) die_unsigned(dw, base, DW_AT_byte_size, &sz, DW_ATTR_REQ); - /* Check for bogus gcc DW_AT_byte_size attribute */ - if (sz == (unsigned)-1) { - printf("dwarf.c:%s() working around bogus -1 DW_AT_byte_size\n", - __func__); - sz = 0; - } - if (tdp->t_name == NULL) terminate("die %ju: base type without name\n", (uintmax_t)off); @@ -2065,7 +2028,6 @@ dw_read(tdata_t *td, Elf *elf, char *fil errno = ENOENT; return (-1); } else { - return (0); } } else if (rc != DW_DLV_OK) { Index: src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c,v retrieving revision 1.7 diff -u -p -r1.7 merge.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c 10 Apr 2016 23:37:10 -0000 1.7 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/merge.c 13 Apr 2017 19:51:37 -0000 @@ -287,24 +287,14 @@ static int equiv_su(tdesc_t *stdp, tdesc_t *ttdp, equiv_data_t *ed) { mlist_t *ml1 = stdp->t_members, *ml2 = ttdp->t_members; - mlist_t *olm1 = NULL; while (ml1 && ml2) { if (ml1->ml_offset != ml2->ml_offset || - strcmp(ml1->ml_name, ml2->ml_name) != 0) + strcmp(ml1->ml_name, ml2->ml_name) != 0 || + ml1->ml_size != ml2->ml_size || + !equiv_node(ml1->ml_type, ml2->ml_type, ed)) return (0); - /* - * Don't do the recursive equivalency checking more than - * we have to. 
- */ - if (olm1 == NULL || olm1->ml_type->t_id != ml1->ml_type->t_id) { - if (ml1->ml_size != ml2->ml_size || - !equiv_node(ml1->ml_type, ml2->ml_type, ed)) - return (0); - } - - olm1 = ml1; ml1 = ml1->ml_next; ml2 = ml2->ml_next; } @@ -352,7 +342,8 @@ fwd_equiv(tdesc_t *ctdp, tdesc_t *mtdp) { tdesc_t *defn = (ctdp->t_type == FORWARD ? mtdp : ctdp); - return (defn->t_type == STRUCT || defn->t_type == UNION); + return (defn->t_type == STRUCT || defn->t_type == UNION || + defn->t_type == ENUM); } static int Index: src/external/cddl/osnet/dist/tools/ctf/cvt/output.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/output.c,v retrieving revision 1.8 diff -u -p -r1.8 output.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/output.c 18 Mar 2016 17:07:23 -0000 1.8 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/output.c 22 Apr 2017 20:40:20 -0000 @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Routines for preparing tdata trees for conversion into CTF data, and * for placing the resulting data into an output file. @@ -582,7 +580,7 @@ write_file(Elf *src, const char *srcname shdr.sh_name); } -#if !defined(sun) +#ifndef illumos if (gelf_update_shdr(dscn, &shdr) == 0) elfterminate(dstname, "Cannot update sect %s", sname); #endif @@ -591,7 +589,7 @@ write_file(Elf *src, const char *srcname elfterminate(srcname, "Cannot get sect %s data", sname); if ((ddata = elf_newdata(dscn)) == NULL) elfterminate(dstname, "Can't make sect %s data", sname); -#if defined(sun) +#ifdef illumos bcopy(sdata, ddata, sizeof (Elf_Data)); #else /* @@ -651,7 +649,7 @@ write_file(Elf *src, const char *srcname } } -#if !defined(sun) +#ifndef illumos if (ddata->d_buf == NULL && sdata->d_buf != NULL) { ddata->d_buf = xmalloc(shdr.sh_size); bcopy(sdata->d_buf, ddata->d_buf, shdr.sh_size); Index: src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c,v retrieving revision 1.7 diff -u -p -r1.7 st_parse.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c 17 Mar 2016 03:05:55 -0000 1.7 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/st_parse.c 13 Apr 2017 19:53:16 -0000 @@ -956,7 +956,7 @@ soudef(char *cp, stabtype_t type, tdesc_ itdp = find_intrinsic(tdp); if (itdp->t_type == INTRINSIC) { - if ((int)mlp->ml_size != itdp->t_intr->intr_nbits) { + if (mlp->ml_size != itdp->t_intr->intr_nbits) { parse_debug(4, cp, "making %d bit intrinsic " "from %s", mlp->ml_size, tdesc_name(itdp)); mlp->ml_type = bitintrinsic(itdp, mlp->ml_size); @@ -1177,7 +1177,7 @@ resolve_typed_bitfields_cb(void *arg, vo while (tdp) { switch (tdp->t_type) { case INTRINSIC: - if ((int)ml->ml_size != tdp->t_intr->intr_nbits) { + if (ml->ml_size != tdp->t_intr->intr_nbits) { debug(3, "making %d bit intrinsic from %s", ml->ml_size, tdesc_name(tdp)); ml->ml_type = bitintrinsic(tdp, ml->ml_size); Index: src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c,v retrieving revision 1.5 diff -u -p -r1.5 stabs.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c 7 Feb 2015 20:30:03 -0000 1.5 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/stabs.c 13 Apr 2017 19:54:32 -0000 @@ -190,7 +190,7 @@ stabs_read(tdata_t *td, Elf *elf, char * 
char *curfile = NULL; char *str; char *fstr = NULL, *ofstr = NULL; - int stabidx, stabstridx = 0; + int stabidx, stabstridx; int nstabs, rc, i; int scope = 0; @@ -198,8 +198,6 @@ stabs_read(tdata_t *td, Elf *elf, char * (stabstridx = findelfsecidx(elf, file, ".stab.exclstr")) >= 0) && !((stabidx = findelfsecidx(elf, file, ".stab")) >= 0 && (stabstridx = findelfsecidx(elf, file, ".stabstr")) >= 0)) { - debug(1, "NO stabs: .stab=%d, .stabstr=%d\n", stabidx, - stabstridx); errno = ENOENT; return (-1); } Index: src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c,v retrieving revision 1.3 diff -u -p -r1.3 strtab.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c 20 Feb 2016 21:50:02 -0000 1.3 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/strtab.c 13 Apr 2017 19:54:50 -0000 @@ -28,8 +28,8 @@ #include #include -#include #include +#include #include #include "strtab.h" Index: src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c,v retrieving revision 1.8 diff -u -p -r1.8 tdata.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c 18 Mar 2016 17:07:23 -0000 1.8 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c 13 Apr 2017 19:55:09 -0000 @@ -177,7 +177,7 @@ tdesc_namecmp(void *arg1, void *arg2) return (!streq(tdp1->t_name, tdp2->t_name)); } -#if defined(sun) +#ifdef illumos /*ARGSUSED1*/ static int tdesc_print(void *data, void *private __unused) Index: src/external/cddl/osnet/dist/tools/ctf/cvt/util.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/tools/ctf/cvt/util.c,v retrieving revision 1.5 diff -u -p -r1.5 util.c --- src/external/cddl/osnet/dist/tools/ctf/cvt/util.c 7 Feb 2015 20:30:03 -0000 1.5 +++ src/external/cddl/osnet/dist/tools/ctf/cvt/util.c 13 Apr 2017 19:55:46 -0000 @@ -175,7 +175,7 @@ aborterr(const char *format, ...) whine("ERROR", format, ap); va_end(ap); -#if defined(sun) +#ifdef illumos abort(); #else exit(0); Index: src/external/cddl/osnet/dist/uts/common/Makefile.files =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/Makefile.files,v retrieving revision 1.2 diff -u -p -r1.2 Makefile.files --- src/external/cddl/osnet/dist/uts/common/Makefile.files 27 Feb 2010 23:43:53 -0000 1.2 +++ src/external/cddl/osnet/dist/uts/common/Makefile.files 7 Jul 2017 17:50:49 -0000 @@ -20,1300 +20,30 @@ # # -# Copyright 2010 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. +# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved. +# Copyright (c) 2012 Joyent, Inc. All rights reserved. +# Copyright (c) 2011, 2014 by Delphix. All rights reserved. +# Copyright (c) 2013 by Saso Kiselkov. All rights reserved. # -# This Makefile defines all file modules for the directory uts/common -# and its children. These are the source files which may be considered -# common to all SunOS systems. 
- -i386_CORE_OBJS += \ - atomic.o \ - avintr.o \ - pic.o - -sparc_CORE_OBJS += - -COMMON_CORE_OBJS += \ - beep.o \ - bitset.o \ - bp_map.o \ - brand.o \ - cpucaps.o \ - cmt.o \ - cmt_policy.o \ - cpu.o \ - cpu_event.o \ - cpu_intr.o \ - cpu_pm.o \ - cpupart.o \ - cap_util.o \ - disp.o \ - group.o \ - kstat_fr.o \ - iscsiboot_prop.o \ - lgrp.o \ - lgrp_topo.o \ - mmapobj.o \ - mutex.o \ - page_lock.o \ - page_retire.o \ - panic.o \ - param.o \ - pg.o \ - pghw.o \ - putnext.o \ - rctl_proc.o \ - rwlock.o \ - seg_kmem.o \ - softint.o \ - string.o \ - strtol.o \ - strtoul.o \ - strtoll.o \ - strtoull.o \ - thread_intr.o \ - vm_page.o \ - vm_pagelist.o \ - zlib_obj.o \ - clock_tick.o - -CORE_OBJS += $(COMMON_CORE_OBJS) $($(MACH)_CORE_OBJS) - -ZLIB_OBJS = zutil.o zmod.o zmod_subr.o \ - adler32.o crc32.o deflate.o inffast.o \ - inflate.o inftrees.o trees.o - -GENUNIX_OBJS += \ - access.o \ - acl.o \ - acl_common.o \ - adjtime.o \ - alarm.o \ - aio_subr.o \ - auditsys.o \ - autoconf.o \ - avl.o \ - bdev_dsort.o \ - bio.o \ - bitmap.o \ - blabel.o \ - brandsys.o \ - bz2blocksort.o \ - bz2compress.o \ - bz2decompress.o \ - bz2randtable.o \ - bz2bzlib.o \ - bz2crctable.o \ - bz2huffman.o \ - callb.o \ - callout.o \ - chdir.o \ - chmod.o \ - chown.o \ - cladm.o \ - class.o \ - clock.o \ - clock_highres.o \ - clock_realtime.o\ - close.o \ - compress.o \ - condvar.o \ - conf.o \ - console.o \ - contract.o \ - copyops.o \ - core.o \ - corectl.o \ - cred.o \ - cs_stubs.o \ - dacf.o \ - dacf_clnt.o \ - damap.o \ - cyclic.o \ - ddi.o \ - ddifm.o \ - ddi_hp_impl.o \ - ddi_hp_ndi.o \ - ddi_intr.o \ - ddi_intr_impl.o \ - ddi_intr_irm.o \ - ddi_nodeid.o \ - ddi_timer.o \ - devcfg.o \ - devcache.o \ - device.o \ - devid.o \ - devid_cache.o \ - devid_scsi.o \ - devid_smp.o \ - devpolicy.o \ - disp_lock.o \ - dnlc.o \ - driver.o \ - dumpsubr.o \ - driver_lyr.o \ - dtrace_subr.o \ - errorq.o \ - etheraddr.o \ - evchannels.o \ - exacct.o \ - exacct_core.o \ - exec.o \ - exit.o \ - fbio.o \ - fcntl.o \ - fdbuffer.o \ - fdsync.o \ - fem.o \ - ffs.o \ - fio.o \ - flock.o \ - fm.o \ - fork.o \ - vpm.o \ - fsat.o \ - fs_reparse.o \ - fs_subr.o \ - fsflush.o \ - ftrace.o \ - getcwd.o \ - getdents.o \ - getloadavg.o \ - getpagesizes.o \ - getpid.o \ - gfs.o \ - rusagesys.o \ - gid.o \ - groups.o \ - grow.o \ - hat.o \ - hat_refmod.o \ - id32.o \ - id_space.o \ - inet_ntop.o \ - instance.o \ - ioctl.o \ - ip_cksum.o \ - issetugid.o \ - ippconf.o \ - kcpc.o \ - kdi.o \ - kiconv.o \ - klpd.o \ - kmem.o \ - ksyms_snapshot.o \ - l_strplumb.o \ - labelsys.o \ - link.o \ - list.o \ - lockstat_subr.o \ - log_sysevent.o \ - logsubr.o \ - lookup.o \ - lseek.o \ - ltos.o \ - lwp.o \ - lwp_create.o \ - lwp_info.o \ - lwp_self.o \ - lwp_sobj.o \ - lwp_timer.o \ - lwpsys.o \ - main.o \ - mmapobjsys.o \ - memcntl.o \ - memstr.o \ - lgrpsys.o \ - mkdir.o \ - mknod.o \ - mount.o \ - move.o \ - msacct.o \ - multidata.o \ - nbmlock.o \ - ndifm.o \ - nice.o \ - netstack.o \ - ntptime.o \ - nvpair.o \ - nvpair_alloc_system.o \ - nvpair_alloc_fixed.o \ - octet.o \ - open.o \ - p_online.o \ - pathconf.o \ - pathname.o \ - pause.o \ - serializer.o \ - pci_intr_lib.o \ - pci_cap.o \ - pcifm.o \ - pgrp.o \ - pgrpsys.o \ - pid.o \ - policy.o \ - poll.o \ - pool.o \ - pool_pset.o \ - port_subr.o \ - ppriv.o \ - printf.o \ - priocntl.o \ - priv.o \ - priv_const.o \ - proc.o \ - procset.o \ - processor_bind.o \ - processor_info.o \ - profil.o \ - project.o \ - qsort.o \ - rctl.o \ - rctlsys.o \ - readlink.o \ - refstr.o \ - rename.o \ - resolvepath.o \ 
- retire_store.o \ - process.o \ - rlimit.o \ - rmap.o \ - rmdir.o \ - rw.o \ - rwstlock.o \ - sad_conf.o \ - sid.o \ - sidsys.o \ - sched.o \ - schedctl.o \ - sctp_crc32.o \ - seg_dev.o \ - seg_kp.o \ - seg_kpm.o \ - seg_map.o \ - seg_vn.o \ - seg_spt.o \ - semaphore.o \ - sendfile.o \ - session.o \ - share.o \ - shuttle.o \ - sig.o \ - sigaction.o \ - sigaltstack.o \ - signotify.o \ - sigpending.o \ - sigprocmask.o \ - sigqueue.o \ - sigsendset.o \ - sigsuspend.o \ - sigtimedwait.o \ - sleepq.o \ - sock_conf.o \ - space.o \ - sscanf.o \ - stat.o \ - statfs.o \ - statvfs.o \ - stol.o \ - str_conf.o \ - strcalls.o \ - stream.o \ - streamio.o \ - strext.o \ - strsubr.o \ - strsun.o \ - subr.o \ - sunddi.o \ - sunmdi.o \ - sunndi.o \ - sunpci.o \ - sunpm.o \ - sundlpi.o \ - suntpi.o \ - swap_subr.o \ - swap_vnops.o \ - symlink.o \ - sync.o \ - sysclass.o \ - sysconfig.o \ - sysent.o \ - sysfs.o \ - systeminfo.o \ - task.o \ - taskq.o \ - tasksys.o \ - time.o \ - timer.o \ - times.o \ - timers.o \ - thread.o \ - tlabel.o \ - tnf_res.o \ - turnstile.o \ - tty_common.o \ - u8_textprep.o \ - uadmin.o \ - uconv.o \ - ucredsys.o \ - uid.o \ - umask.o \ - umount.o \ - uname.o \ - unix_bb.o \ - unlink.o \ - urw.o \ - utime.o \ - utssys.o \ - uucopy.o \ - vfs.o \ - vfs_conf.o \ - vmem.o \ - vm_anon.o \ - vm_as.o \ - vm_meter.o \ - vm_pageout.o \ - vm_pvn.o \ - vm_rm.o \ - vm_seg.o \ - vm_subr.o \ - vm_swap.o \ - vm_usage.o \ - vnode.o \ - vuid_queue.o \ - vuid_store.o \ - waitq.o \ - watchpoint.o \ - yield.o \ - scsi_confdata.o \ - xattr.o \ - xattr_common.o \ - xdr_mblk.o \ - xdr_mem.o \ - xdr.o \ - xdr_array.o \ - xdr_refer.o \ - xhat.o \ - zone.o - -# -# Stubs for the stand-alone linker/loader -# -sparc_GENSTUBS_OBJS = \ - kobj_stubs.o - -i386_GENSTUBS_OBJS = - -COMMON_GENSTUBS_OBJS = - -GENSTUBS_OBJS += $(COMMON_GENSTUBS_OBJS) $($(MACH)_GENSTUBS_OBJS) - -# -# DTrace and DTrace Providers -# -DTRACE_OBJS += dtrace.o dtrace_isa.o dtrace_asm.o - -SDT_OBJS += sdt_subr.o - -PROFILE_OBJS += profile.o - -SYSTRACE_OBJS += systrace.o - -LX_SYSTRACE_OBJS += lx_systrace.o - -LOCKSTAT_OBJS += lockstat.o - -FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o - -DCPC_OBJS += dcpc.o - -# -# Driver (pseudo-driver) Modules -# -IPP_OBJS += ippctl.o - -AUDIO_OBJS += audio_client.o audio_ddi.o audio_engine.o \ - audio_fltdata.o audio_format.o audio_ctrl.o \ - audio_grc3.o audio_output.o audio_input.o \ - audio_oss.o audio_sun.o - -AUDIOEMU10K_OBJS += audioemu10k.o - -AUDIOENS_OBJS += audioens.o - -AUDIOVIA823X_OBJS += audiovia823x.o - -AUDIOVIA97_OBJS += audiovia97.o - -AUDIO1575_OBJS += audio1575.o - -AUDIO810_OBJS += audio810.o - -AUDIOCMI_OBJS += audiocmi.o - -AUDIOHD_OBJS += audiohd.o - -AUDIOIXP_OBJS += audioixp.o - -AUDIOLS_OBJS += audiols.o - -AUDIOP16X_OBJS += audiop16x.o - -AUDIOPCI_OBJS += audiopci.o - -AUDIOSOLO_OBJS += audiosolo.o - -AUDIOTS_OBJS += audiots.o - -AC97_OBJS += ac97.o ac97_ad.o ac97_alc.o ac97_cmi.o - -CARDBUS_OBJS += cardbus.o cardbus_hp.o cardbus_cfg.o - -CONSKBD_OBJS += conskbd.o - -CONSMS_OBJS += consms.o - -OLDPTY_OBJS += tty_ptyconf.o - -PTC_OBJS += tty_pty.o - -PTSL_OBJS += tty_pts.o - -PTM_OBJS += ptm.o - -LX_PTM_OBJS += lx_ptm.o - -LX_AUDIO_OBJS += lx_audio.o - -MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \ - mii_marvell.o mii_realtek.o mii_other.o - -PTS_OBJS += pts.o - -PTY_OBJS += ptms_conf.o - -SAD_OBJS += sad.o - -MD4_OBJS += md4.o md4_mod.o - -MD5_OBJS += md5.o md5_mod.o - -SHA1_OBJS += sha1.o sha1_mod.o fips_sha1_util.o - -SHA2_OBJS += sha2.o sha2_mod.o 
fips_sha2_util.o - -IPGPC_OBJS += classifierddi.o classifier.o filters.o trie.o table.o \ - ba_table.o - -DSCPMK_OBJS += dscpmk.o dscpmkddi.o - -DLCOSMK_OBJS += dlcosmk.o dlcosmkddi.o - -FLOWACCT_OBJS += flowacctddi.o flowacct.o - -TOKENMT_OBJS += tokenmt.o tokenmtddi.o - -TSWTCL_OBJS += tswtcl.o tswtclddi.o - -ARP_OBJS += arpddi.o - -ICMP_OBJS += icmpddi.o - -ICMP6_OBJS += icmp6ddi.o - -RTS_OBJS += rtsddi.o - -IP_ICMP_OBJS = icmp.o icmp_opt_data.o -IP_RTS_OBJS = rts.o rts_opt_data.o -IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_kssl.o tcp_opt_data.o tcp_sack.o -IP_UDP_OBJS = udp.o udp_opt_data.o -IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ - sctp_init.o sctp_input.o sctp_cookie.o \ - sctp_conn.o sctp_error.o sctp_snmp.o \ - sctp_param.o sctp_shutdown.o sctp_common.o \ - sctp_timer.o sctp_heartbeat.o sctp_hash.o \ - sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \ - sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o -IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o - -IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ - ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ - ip_multi.o ip2mac.o ip_ndp.o ip_rts.o ip_srcid.o \ - ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ - spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \ - ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \ - ip_helper_stream.o \ - ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \ - conn_opt.o ip_attr.o ip_dce.o \ - $(IP_ICMP_OBJS) \ - $(IP_RTS_OBJS) \ - $(IP_TCP_OBJS) \ - $(IP_UDP_OBJS) \ - $(IP_SCTP_OBJS) \ - $(IP_ILB_OBJS) - -IP6_OBJS += ip6ddi.o - -HOOK_OBJS += hook.o - -NETI_OBJS += neti_impl.o neti_mod.o neti_stack.o - -KEYSOCK_OBJS += keysockddi.o keysock.o keysock_opt_data.o - -IPNET_OBJS += ipnet.o ipnet_bpf.o - -SPDSOCK_OBJS += spdsockddi.o spdsock.o spdsock_opt_data.o - -IPSECESP_OBJS += ipsecespddi.o ipsecesp.o - -IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o - -SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o - -SPPPTUN_OBJS += sppptun.o sppptun_mod.o - -SPPPASYN_OBJS += spppasyn.o spppasyn_mod.o - -SPPPCOMP_OBJS += spppcomp.o spppcomp_mod.o deflate.o bsd-comp.o vjcompress.o \ - zlib.o - -TCP_OBJS += tcpddi.o - -TCP6_OBJS += tcp6ddi.o - -SCTP_OBJS += sctpddi.o - -SCTP6_OBJS += sctp6ddi.o - -NCA_OBJS += ncaddi.o - -SDP_SOCK_MOD_OBJS += sockmod_sdp.o socksdp.o socksdpsubr.o - -SCTP_SOCK_MOD_OBJS += sockmod_sctp.o socksctp.o socksctpsubr.o - -PFP_SOCK_MOD_OBJS += sockmod_pfp.o - -RDS_OBJS += rdsddi.o rdssubr.o rds_opt.o rds_ioctl.o - -RDSIB_OBJS += rdsib.o rdsib_ib.o rdsib_cm.o rdsib_ep.o rdsib_buf.o \ - rdsib_debug.o rdsib_sc.o - -ISER_OBJS += iser.o iser_cm.o iser_cq.o iser_ib.o iser_idm.o \ - iser_resource.o iser_xfer.o - -UDP_OBJS += udpddi.o - -UDP6_OBJS += udp6ddi.o - -SY_OBJS += gentty.o - -TCO_OBJS += ticots.o - -TCOO_OBJS += ticotsord.o - -TCL_OBJS += ticlts.o - -TL_OBJS += tl.o - -DUMP_OBJS += dump.o - -BPF_OBJS += bpf.o bpf_filter.o bpf_mod.o bpf_dlt.o bpf_mac.o - -CLONE_OBJS += clone.o - -CN_OBJS += cons.o - -DLD_OBJS += dld_drv.o dld_proto.o dld_str.o dld_flow.o - -DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_mgmt.o - -GLD_OBJS += gld.o gldutil.o - -MAC_OBJS += mac.o mac_bcast.o mac_client.o mac_datapath_setup.o mac_flow.o \ - mac_hio.o mac_mod.o mac_ndd.o mac_provider.o mac_sched.o \ - mac_protect.o mac_soft_ring.o mac_stat.o mac_util.o - -MAC_6TO4_OBJS += mac_6to4.o - -MAC_ETHER_OBJS += mac_ether.o - -MAC_IPV4_OBJS += mac_ipv4.o - -MAC_IPV6_OBJS += mac_ipv6.o - -MAC_WIFI_OBJS += mac_wifi.o - -MAC_IB_OBJS += 
mac_ib.o - -IPTUN_OBJS += iptun_dev.o iptun_ctl.o iptun.o - -AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \ - aggr_send.o aggr_recv.o aggr_lacp.o - -SOFTMAC_OBJS += softmac_main.o softmac_ctl.o softmac_capab.o \ - softmac_dev.o softmac_stat.o softmac_pkt.o softmac_fp.o - -NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ - net80211_output.o net80211_node.o net80211_crypto.o \ - net80211_crypto_none.o net80211_crypto_wep.o net80211_ioctl.o \ - net80211_crypto_tkip.o net80211_crypto_ccmp.o \ - net80211_ht.o - -VNIC_OBJS += vnic_ctl.o vnic_dev.o - -SIMNET_OBJS += simnet.o - -IB_OBJS += ibnex.o ibnex_ioctl.o - -IBCM_OBJS += ibcm_impl.o ibcm_sm.o ibcm_ti.o ibcm_utils.o ibcm_path.o \ - ibcm_arp.o ibcm_arp_link.o - -IBDM_OBJS += ibdm.o - -IBDMA_OBJS += ibdma.o - -IBMF_OBJS += ibmf.o ibmf_impl.o ibmf_dr.o ibmf_wqe.o ibmf_ud_dest.o ibmf_mod.o \ - ibmf_send.o ibmf_recv.o ibmf_handlers.o ibmf_trans.o \ - ibmf_timers.o ibmf_msg.o ibmf_utils.o ibmf_rmpp.o \ - ibmf_saa.o ibmf_saa_impl.o ibmf_saa_utils.o ibmf_saa_events.o - -IBTL_OBJS += ibtl_impl.o ibtl_util.o ibtl_mem.o ibtl_handlers.o ibtl_qp.o \ - ibtl_cq.o ibtl_wr.o ibtl_hca.o ibtl_chan.o ibtl_cm.o \ - ibtl_mcg.o ibtl_ibnex.o ibtl_srq.o - -TAVOR_OBJS += tavor.o tavor_agents.o tavor_cfg.o tavor_ci.o tavor_cmd.o \ - tavor_cq.o tavor_event.o tavor_ioctl.o tavor_misc.o \ - tavor_mr.o tavor_qp.o tavor_qpmod.o tavor_rsrc.o \ - tavor_srq.o tavor_stats.o tavor_umap.o tavor_wr.o - -HERMON_OBJS += hermon.o hermon_agents.o hermon_cfg.o hermon_ci.o hermon_cmd.o \ - hermon_cq.o hermon_event.o hermon_ioctl.o hermon_misc.o \ - hermon_mr.o hermon_qp.o hermon_qpmod.o hermon_rsrc.o \ - hermon_srq.o hermon_stats.o hermon_umap.o hermon_wr.o \ - hermon_fm.o - -DAPLT_OBJS += daplt.o - -KSTAT_OBJS += kstat.o - -KSYMS_OBJS += ksyms.o - -INSTANCE_OBJS += inst_sync.o - -IWSCN_OBJS += iwscons.o - -LOFI_OBJS += lofi.o LzmaDec.o - -FSSNAP_OBJS += fssnap.o - -FSSNAPIF_OBJS += fssnap_if.o - -MM_OBJS += mem.o - -PHYSMEM_OBJS += physmem.o - -OPTIONS_OBJS += options.o - -WINLOCK_OBJS += winlockio.o - -PM_OBJS += pm.o -SRN_OBJS += srn.o - -PSEUDO_OBJS += pseudonex.o - -RAMDISK_OBJS += ramdisk.o - -LLC1_OBJS += llc1.o - -USBKBM_OBJS += usbkbm.o - -USBWCM_OBJS += usbwcm.o - -BOFI_OBJS += bofi.o - -HID_OBJS += hid.o - -HWA_RC_OBJS += hwarc.o - -USBSKEL_OBJS += usbskel.o - -USBVC_OBJS += usbvc.o usbvc_v4l2.o - -HIDPARSER_OBJS += hidparser.o - -USB_AC_OBJS += usb_ac.o - -USB_AS_OBJS += usb_as.o - -USB_AH_OBJS += usb_ah.o - -USBMS_OBJS += usbms.o - -USBPRN_OBJS += usbprn.o - -UGEN_OBJS += ugen.o - -USBSER_OBJS += usbser.o usbser_rseq.o - -USBSACM_OBJS += usbsacm.o - -USBSER_KEYSPAN_OBJS += usbser_keyspan.o keyspan_dsd.o keyspan_pipe.o - -USBS49_FW_OBJS += keyspan_49fw.o - -USBSPRL_OBJS += usbser_pl2303.o pl2303_dsd.o - -WUSB_CA_OBJS += wusb_ca.o - -USBFTDI_OBJS += usbser_uftdi.o uftdi_dsd.o - -WC_OBJS += wscons.o vcons.o - -VCONS_CONF_OBJS += vcons_conf.o - -SCSI_OBJS += scsi_capabilities.o scsi_confsubr.o scsi_control.o \ - scsi_data.o scsi_fm.o scsi_hba.o scsi_reset_notify.o \ - scsi_resource.o scsi_subr.o scsi_transport.o scsi_watch.o \ - smp_transport.o - -SCSI_VHCI_OBJS += scsi_vhci.o mpapi_impl.o scsi_vhci_tpgs.o - -SCSI_VHCI_F_SYM_OBJS += sym.o - -SCSI_VHCI_F_TPGS_OBJS += tpgs.o - -SCSI_VHCI_F_ASYM_SUN_OBJS += asym_sun.o - -SCSI_VHCI_F_SYM_HDS_OBJS += sym_hds.o - -SCSI_VHCI_F_TAPE_OBJS += tape.o - -SCSI_VHCI_F_TPGS_TAPE_OBJS += tpgs_tape.o - -SGEN_OBJS += sgen.o - -SMP_OBJS += smp.o - -SATA_OBJS += sata.o - -USBA_OBJS += hcdi.o usba.o usbai.o hubdi.o 
parser.o genconsole.o \ - usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \ - usba_devdb.o usba10_calls.o usba_ugen.o whcdi.o wa.o -USBA_WITHOUT_WUSB_OBJS += hcdi.o usba.o usbai.o hubdi.o parser.o genconsole.o \ - usbai_pipe_mgmt.o usbai_req.o usbai_util.o usbai_register.o \ - usba_devdb.o usba10_calls.o usba_ugen.o - -USBA10_OBJS += usba10.o - -RSM_OBJS += rsm.o rsmka_pathmanager.o rsmka_util.o - -RSMOPS_OBJS += rsmops.o - -S1394_OBJS += t1394.o t1394_errmsg.o s1394.o s1394_addr.o s1394_asynch.o \ - s1394_bus_reset.o s1394_cmp.o s1394_csr.o s1394_dev_disc.o \ - s1394_fa.o s1394_fcp.o \ - s1394_hotplug.o s1394_isoch.o s1394_misc.o h1394.o nx1394.o - -HCI1394_OBJS += hci1394.o hci1394_async.o hci1394_attach.o hci1394_buf.o \ - hci1394_csr.o hci1394_detach.o hci1394_extern.o \ - hci1394_ioctl.o hci1394_isoch.o hci1394_isr.o \ - hci1394_ixl_comp.o hci1394_ixl_isr.o hci1394_ixl_misc.o \ - hci1394_ixl_update.o hci1394_misc.o hci1394_ohci.o \ - hci1394_q.o hci1394_s1394if.o hci1394_tlabel.o \ - hci1394_tlist.o hci1394_vendor.o - -AV1394_OBJS += av1394.o av1394_as.o av1394_async.o av1394_cfgrom.o \ - av1394_cmp.o av1394_fcp.o av1394_isoch.o av1394_isoch_chan.o \ - av1394_isoch_recv.o av1394_isoch_xmit.o av1394_list.o \ - av1394_queue.o - -DCAM1394_OBJS += dcam.o dcam_frame.o dcam_param.o dcam_reg.o \ - dcam_ring_buff.o - -SCSA1394_OBJS += hba.o sbp2_driver.o sbp2_bus.o - -SBP2_OBJS += cfgrom.o sbp2.o - -PMODEM_OBJS += pmodem.o pmodem_cis.o cis.o cis_callout.o cis_handlers.o cis_params.o - -DSW_OBJS += dsw.o dsw_dev.o ii_tree.o - -NCALL_OBJS += ncall.o \ - ncall_stub.o - -RDC_OBJS += rdc.o \ - rdc_dev.o \ - rdc_io.o \ - rdc_clnt.o \ - rdc_prot_xdr.o \ - rdc_svc.o \ - rdc_bitmap.o \ - rdc_health.o \ - rdc_subr.o \ - rdc_diskq.o - -RDCSRV_OBJS += rdcsrv.o - -RDCSTUB_OBJS += rdc_stub.o - -SDBC_OBJS += sd_bcache.o \ - sd_bio.o \ - sd_conf.o \ - sd_ft.o \ - sd_hash.o \ - sd_io.o \ - sd_misc.o \ - sd_pcu.o \ - sd_tdaemon.o \ - sd_trace.o \ - sd_iob_impl0.o \ - sd_iob_impl1.o \ - sd_iob_impl2.o \ - sd_iob_impl3.o \ - sd_iob_impl4.o \ - sd_iob_impl5.o \ - sd_iob_impl6.o \ - sd_iob_impl7.o \ - safestore.o \ - safestore_ram.o - -NSCTL_OBJS += nsctl.o \ - nsc_cache.o \ - nsc_disk.o \ - nsc_dev.o \ - nsc_freeze.o \ - nsc_gen.o \ - nsc_mem.o \ - nsc_ncallio.o \ - nsc_power.o \ - nsc_resv.o \ - nsc_rmspin.o \ - nsc_solaris.o \ - nsc_trap.o \ - nsc_list.o -UNISTAT_OBJS += spuni.o \ - spcs_s_k.o - -NSKERN_OBJS += nsc_ddi.o \ - nsc_proc.o \ - nsc_raw.o \ - nsc_thread.o \ - nskernd.o - -SV_OBJS += sv.o - -PMCS_OBJS += pmcs_attach.o pmcs_ds.o pmcs_intr.o pmcs_nvram.o pmcs_sata.o \ - pmcs_scsa.o pmcs_smhba.o pmcs_subr.o pmcs_fwlog.o - -PMCS8001FW_C_OBJS += pmcs_fw_hdr.o -PMCS8001FW_OBJS += $(PMCS8001FW_C_OBJS) SPCBoot.o ila.o firmware.o - -# -# Build up defines and paths. 
- -ST_OBJS += st.o st_conf.o - -EMLXS_OBJS += emlxs_clock.o emlxs_dfc.o emlxs_dhchap.o emlxs_diag.o \ - emlxs_download.o emlxs_dump.o emlxs_els.o emlxs_event.o \ - emlxs_fcp.o emlxs_fct.o emlxs_hba.o emlxs_ip.o \ - emlxs_mbox.o emlxs_mem.o emlxs_msg.o emlxs_node.o \ - emlxs_pkt.o emlxs_sli3.o emlxs_sli4.o emlxs_solaris.o \ - emlxs_thread.o - -EMLXS_FW_OBJS += emlxs_fw.o - -OCE_OBJS += oce_buf.o oce_fm.o oce_gld.o oce_hw.o oce_intr.o oce_main.o \ - oce_mbx.o oce_mq.o oce_queue.o oce_rx.o oce_stat.o oce_tx.o \ - oce_utils.o - -FCT_OBJS += discovery.o fct.o - -QLT_OBJS += 2400.o 2500.o 8100.o qlt.o qlt_dma.o - -SRPT_OBJS += srpt_mod.o srpt_ch.o srpt_cm.o srpt_ioc.o srpt_stp.o - -FCOE_OBJS += fcoe.o fcoe_eth.o fcoe_fc.o - -FCOET_OBJS += fcoet.o fcoet_eth.o fcoet_fc.o - -FCOEI_OBJS += fcoei.o fcoei_eth.o fcoei_lv.o - -ISCSIT_SHARED_OBJS += \ - iscsit_common.o - -ISCSIT_OBJS += $(ISCSIT_SHARED_OBJS) \ - iscsit.o iscsit_tgt.o iscsit_sess.o iscsit_login.o \ - iscsit_text.o iscsit_isns.o iscsit_radiusauth.o \ - iscsit_radiuspacket.o iscsit_auth.o iscsit_authclient.o - -PPPT_OBJS += alua_ic_if.o pppt.o pppt_msg.o pppt_tgt.o - -STMF_OBJS += lun_map.o stmf.o - -STMF_SBD_OBJS += sbd.o sbd_scsi.o sbd_pgr.o - -SYSMSG_OBJS += sysmsg.o - -SES_OBJS += ses.o ses_sen.o ses_safte.o ses_ses.o - -TNF_OBJS += tnf_buf.o tnf_trace.o tnf_writer.o trace_init.o \ - trace_funcs.o tnf_probe.o tnf.o - -LOGINDMUX_OBJS += logindmux.o - -DEVINFO_OBJS += devinfo.o - -DEVPOLL_OBJS += devpoll.o - -DEVPOOL_OBJS += devpool.o - -I8042_OBJS += i8042.o - -KB8042_OBJS += \ - at_keyprocess.o \ - kb8042.o \ - kb8042_keytables.o - -MOUSE8042_OBJS += mouse8042.o - -FDC_OBJS += fdc.o - -ASY_OBJS += asy.o - -ECPP_OBJS += ecpp.o - -VUIDM3P_OBJS += vuidmice.o vuidm3p.o - -VUIDM4P_OBJS += vuidmice.o vuidm4p.o - -VUIDM5P_OBJS += vuidmice.o vuidm5p.o - -VUIDPS2_OBJS += vuidmice.o vuidps2.o - -HPCSVC_OBJS += hpcsvc.o - -PCIE_MISC_OBJS += pcie.o pcie_fault.o pcie_hp.o pciehpc.o pcishpc.o pcie_pwr.o pciev.o - -PCIHPNEXUS_OBJS += pcihp.o - -OPENEEPR_OBJS += openprom.o - -RANDOM_OBJS += random.o - -PSHOT_OBJS += pshot.o - -GEN_DRV_OBJS += gen_drv.o - -TCLIENT_OBJS += tclient.o - -TPHCI_OBJS += tphci.o - -TVHCI_OBJS += tvhci.o - -EMUL64_OBJS += emul64.o emul64_bsd.o - -FCP_OBJS += fcp.o - -FCIP_OBJS += fcip.o - -FCSM_OBJS += fcsm.o - -FCTL_OBJS += fctl.o - -FP_OBJS += fp.o - -QLC_OBJS += ql_api.o ql_debug.o ql_hba_fru.o ql_init.o ql_iocb.o ql_ioctl.o \ - ql_isr.o ql_mbx.o ql_xioctl.o ql_fw_table.o - -QLC_FW_2200_OBJS += ql_fw_2200.o - -QLC_FW_2300_OBJS += ql_fw_2300.o - -QLC_FW_2400_OBJS += ql_fw_2400.o - -QLC_FW_2500_OBJS += ql_fw_2500.o - -QLC_FW_6322_OBJS += ql_fw_6322.o - -QLC_FW_8100_OBJS += ql_fw_8100.o - -QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_gld.o qlge_mpi.o - -ZCONS_OBJS += zcons.o - -NV_SATA_OBJS += nv_sata.o - -SI3124_OBJS += si3124.o - -AHCI_OBJS += ahci.o - -PCIIDE_OBJS += pci-ide.o - -PCEPP_OBJS += pcepp.o - -CPC_OBJS += cpc.o - -CPUID_OBJS += cpuid_drv.o - -SYSEVENT_OBJS += sysevent.o - -BL_OBJS += bl.o - -DRM_OBJS += drm_sunmod.o drm_kstat.o drm_agpsupport.o \ - drm_auth.o drm_bufs.o drm_context.o drm_dma.o \ - drm_drawable.o drm_drv.o drm_fops.o drm_ioctl.o drm_irq.o \ - drm_lock.o drm_memory.o drm_msg.o drm_pci.o drm_scatter.o \ - drm_cache.o drm_gem.o drm_mm.o ati_pcigart.o - -FM_OBJS += devfm.o devfm_machdep.o - -RTLS_OBJS += rtls.o - -# -# exec modules -# -AOUTEXEC_OBJS +=aout.o - -ELFEXEC_OBJS += elf.o elf_notes.o old_notes.o - -INTPEXEC_OBJS +=intp.o - -SHBINEXEC_OBJS +=shbin.o - -JAVAEXEC_OBJS +=java.o - -# -# file 
system modules -# -AUTOFS_OBJS += auto_vfsops.o auto_vnops.o auto_subr.o auto_xdr.o auto_sys.o - -CACHEFS_OBJS += cachefs_cnode.o cachefs_cod.o \ - cachefs_dir.o cachefs_dlog.o cachefs_filegrp.o \ - cachefs_fscache.o cachefs_ioctl.o cachefs_log.o \ - cachefs_module.o \ - cachefs_noopc.o cachefs_resource.o \ - cachefs_strict.o \ - cachefs_subr.o cachefs_vfsops.o \ - cachefs_vnops.o - -DCFS_OBJS += dc_vnops.o - -DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o - -DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \ - sdev_ptsops.o sdev_zvolops.o sdev_comm.o \ - sdev_profile.o sdev_ncache.o sdev_netops.o \ - sdev_ipnetops.o \ - sdev_vtops.o - -CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \ - ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o - -OBJFS_OBJS += objfs_vfs.o objfs_root.o objfs_common.o \ - objfs_odir.o objfs_data.o - -FDFS_OBJS += fdops.o - -FIFO_OBJS += fifosubr.o fifovnops.o - -PIPE_OBJS += pipe.o - -HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ - hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o - -LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o - -NAMEFS_OBJS += namevfs.o namevno.o - -NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \ - nfs_subr.o nfs_vfsops.o nfs_vnops.o \ - nfs_xdr.o nfs_sys.o nfs_strerror.o \ - nfs3_vfsops.o nfs3_vnops.o nfs3_xdr.o \ - nfs_acl_vnops.o nfs_acl_xdr.o nfs4_vfsops.o \ - nfs4_vnops.o nfs4_xdr.o nfs4_idmap.o \ - nfs4_shadow.o nfs4_subr.o \ - nfs4_attr.o nfs4_rnode.o nfs4_client.o \ - nfs4_acache.o nfs4_common.o nfs4_client_state.o \ - nfs4_callback.o nfs4_recovery.o nfs4_client_secinfo.o \ - nfs4_client_debug.o nfs_stats.o \ - nfs4_acl.o nfs4_stub_vnops.o nfs_cmd.o - -NFSSRV_OBJS += nfs_server.o nfs_srv.o nfs3_srv.o \ - nfs_acl_srv.o nfs_auth.o nfs_auth_xdr.o \ - nfs_export.o nfs_log.o nfs_log_xdr.o \ - nfs4_srv.o nfs4_state.o nfs4_srv_attr.o \ - nfs4_srv_ns.o nfs4_db.o nfs4_srv_deleg.o \ - nfs4_deleg_ops.o nfs4_srv_readdir.o nfs4_dispatch.o - -SMBSRV_SHARED_OBJS += \ - smb_inet.o \ - smb_match.o \ - smb_msgbuf.o \ - smb_oem.o \ - smb_string.o \ - smb_utf8.o \ - smb_common_door_decode.o \ - smb_xdr_utils.o \ - smb_token.o \ - smb_token_xdr.o \ - smb_sid.o \ - smb_status_xlat.o \ - smb_native.o \ - smb_netbios_util.o \ - smb_share_door_decode.o - -SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \ - smb_acl.o \ - smb_alloc.o \ - smb_close.o \ - smb_common_open.o \ - smb_common_transact.o \ - smb_create.o \ - smb_delete.o \ - smb_directory.o \ - smb_dispatch.o \ - smb_echo.o \ - smb_fem.o \ - smb_find.o \ - smb_flush.o \ - smb_fsops.o \ - smb_init.o \ - smb_kdoor_encdec.o \ - smb_kdoor_clnt.o \ - smb_kshare.o \ - smb_lock.o \ - smb_lock_byte_range.o \ - smb_locking_andx.o \ - smb_logoff_andx.o \ - smb_mangle_name.o \ - smb_mbuf_marshaling.o \ - smb_mbuf_util.o \ - smb_negotiate.o \ - smb_net.o \ - smb_node.o \ - smb_nt_cancel.o \ - smb_nt_create_andx.o \ - smb_nt_transact_create.o \ - smb_nt_transact_ioctl.o \ - smb_nt_transact_notify_change.o \ - smb_nt_transact_security.o \ - smb_odir.o \ - smb_ofile.o \ - smb_open_andx.o \ - smb_opipe.o \ - smb_oplock.o \ - smb_pathname.o \ - smb_print.o \ - smb_process_exit.o \ - smb_query_fileinfo.o \ - smb_query_information_disk.o \ - smb_read.o \ - smb_rename.o \ - smb_sd.o \ - smb_seek.o \ - smb_server.o \ - smb_session.o \ - smb_session_setup_andx.o \ - smb_set_fileinfo.o \ - smb_signing.o \ - smb_tree.o \ - smb_trans2_create_directory.o \ - smb_trans2_dfs.o \ - smb_trans2_find.o \ - smb_trans2_query_fs_information.o \ - smb_tree_connect.o \ - smb_unlock_byte_range.o \ - 
smb_upcalls.o \ - smb_user.o \ - smb_util.o \ - smb_vfs.o \ - smb_vops.o \ - smb_vss.o \ - smb_write.o \ - smb_write_raw.o \ - smb_xlate.o - -PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \ - pc_vfsops.o pc_vnops.o - -PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \ - prvfsops.o prvnops.o - -MNTFS_OBJS += mntvfsops.o mntvnops.o - -SHAREFS_OBJS += sharetab.o sharefs_vfsops.o sharefs_vnops.o - -SPEC_OBJS += specsubr.o specvfsops.o specvnops.o - -SOCK_OBJS += socksubr.o sockvfsops.o sockparams.o \ - socksyscalls.o socktpi.o sockstr.o sockssl.o \ - sockcommon_vnops.o sockcommon_subr.o \ - sockcommon_sops.o sockcommon.o \ - sock_notsupp.o socknotify.o \ - nl7c.o nl7curi.o nl7chttp.o nl7clogd.o \ - nl7cnca.o sodirect.o - -TMPFS_OBJS += tmp_dir.o tmp_subr.o tmp_tnode.o tmp_vfsops.o \ - tmp_vnops.o - -UDFS_OBJS += udf_alloc.o udf_bmap.o udf_dir.o \ - udf_inode.o udf_subr.o udf_vfsops.o \ - udf_vnops.o - -UFS_OBJS += ufs_alloc.o ufs_bmap.o ufs_dir.o ufs_xattr.o \ - ufs_inode.o ufs_subr.o ufs_tables.o ufs_vfsops.o \ - ufs_vnops.o quota.o quotacalls.o quota_ufs.o \ - ufs_filio.o ufs_lockfs.o ufs_thread.o ufs_trans.o \ - ufs_acl.o ufs_panic.o ufs_directio.o ufs_log.o \ - ufs_extvnops.o ufs_snap.o lufs.o lufs_thread.o \ - lufs_log.o lufs_map.o lufs_top.o lufs_debug.o -VSCAN_OBJS += vscan_drv.o vscan_svc.o vscan_door.o - -NSMB_OBJS += smb_conn.o smb_dev.o smb_iod.o smb_pass.o \ - smb_rq.o smb_sign.o smb_smb.o smb_subrs.o \ - smb_time.o smb_tran.o smb_trantcp.o smb_usr.o \ - subr_mchain.o - -SMBFS_COMMON_OBJS += smbfs_ntacl.o -SMBFS_OBJS += smbfs_vfsops.o smbfs_vnops.o smbfs_node.o \ - smbfs_acl.o smbfs_client.o smbfs_smb.o \ - smbfs_subr.o smbfs_subr2.o \ - smbfs_rwlock.o smbfs_xattr.o \ - $(SMBFS_COMMON_OBJS) - - -# -# LVM modules -# -MD_OBJS += md.o md_error.o md_ioctl.o md_mddb.o md_names.o \ - md_med.o md_rename.o md_subr.o - -MD_COMMON_OBJS = md_convert.o md_crc.o md_revchk.o - -MD_DERIVED_OBJS = metamed_xdr.o meta_basic_xdr.o - -SOFTPART_OBJS += sp.o sp_ioctl.o - -STRIPE_OBJS += stripe.o stripe_ioctl.o - -HOTSPARES_OBJS += hotspares.o - -RAID_OBJS += raid.o raid_ioctl.o raid_replay.o raid_resync.o raid_hotspare.o - -MIRROR_OBJS += mirror.o mirror_ioctl.o mirror_resync.o - -NOTIFY_OBJS += md_notify.o - -TRANS_OBJS += mdtrans.o trans_ioctl.o trans_log.o +# +# This Makefile defines all file modules for the directory uts/common +# and its children. These are the source files which may be considered +# common to all SunOS systems. 
ZFS_COMMON_OBJS += \ arc.o \ bplist.o \ + blkptr.o \ + bpobj.o \ + bptree.o \ + bqueue.o \ dbuf.o \ ddt.o \ ddt_zap.o \ dmu.o \ + dmu_diff.o \ dmu_send.o \ dmu_object.o \ dmu_objset.o \ @@ -1321,18 +51,28 @@ ZFS_COMMON_OBJS += \ dmu_tx.o \ dnode.o \ dnode_sync.o \ + dsl_bookmark.o \ dsl_dir.o \ dsl_dataset.o \ + dsl_deadlist.o \ + dsl_destroy.o \ dsl_pool.o \ dsl_synctask.o \ + dsl_userhold.o \ dmu_zfetch.o \ dsl_deleg.o \ dsl_prop.o \ - dsl_scrub.o \ + dsl_scan.o \ + zfeature.o \ gzip.o \ + lz4.o \ lzjb.o \ metaslab.o \ + multilist.o \ + range_tree.o \ refcount.o \ + rrwlock.o \ + sa.o \ sha256.o \ spa.o \ spa_config.o \ @@ -1340,6 +80,7 @@ ZFS_COMMON_OBJS += \ spa_history.o \ spa_misc.o \ space_map.o \ + space_reftree.o \ txg.o \ uberblock.o \ unique.o \ @@ -1356,626 +97,41 @@ ZFS_COMMON_OBJS += \ zap_leaf.o \ zap_micro.o \ zfs_byteswap.o \ + zfs_debug.o \ zfs_fm.o \ zfs_fuid.o \ + zfs_sa.o \ zfs_znode.o \ zil.o \ zio.o \ zio_checksum.o \ zio_compress.o \ zio_inject.o \ - zle.o + zle.o \ + zrlock.o ZFS_SHARED_OBJS += \ - zfs_namecheck.o \ - zfs_deleg.o \ - zfs_prop.o \ + zfeature_common.o \ zfs_comutil.o \ + zfs_deleg.o \ zfs_fletcher.o \ + zfs_namecheck.o \ + zfs_prop.o \ zpool_prop.o \ zprop_common.o ZFS_OBJS += \ $(ZFS_COMMON_OBJS) \ $(ZFS_SHARED_OBJS) \ - vdev_disk.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_dir.o \ zfs_ioctl.o \ + zfs_ioctl_compat.o \ zfs_log.o \ + zfs_onexit.o \ zfs_replay.o \ zfs_rlock.o \ - rrwlock.o \ zfs_vfsops.o \ zfs_vnops.o \ zvol.o - -ZUT_OBJS += \ - zut.o - -# -# streams modules -# -BUFMOD_OBJS += bufmod.o - -CONNLD_OBJS += connld.o - -DEDUMP_OBJS += dedump.o - -DRCOMPAT_OBJS += drcompat.o - -LDLINUX_OBJS += ldlinux.o - -LDTERM_OBJS += ldterm.o uwidth.o - -PCKT_OBJS += pckt.o - -PFMOD_OBJS += pfmod.o - -PTEM_OBJS += ptem.o - -REDIRMOD_OBJS += strredirm.o - -TIMOD_OBJS += timod.o - -TIRDWR_OBJS += tirdwr.o - -TTCOMPAT_OBJS +=ttcompat.o - -LOG_OBJS += log.o - -PIPEMOD_OBJS += pipemod.o - -RPCMOD_OBJS += rpcmod.o clnt_cots.o clnt_clts.o \ - clnt_gen.o clnt_perr.o mt_rpcinit.o rpc_calmsg.o \ - rpc_prot.o rpc_sztypes.o rpc_subr.o rpcb_prot.o \ - svc.o svc_clts.o svc_gen.o svc_cots.o \ - rpcsys.o xdr_sizeof.o clnt_rdma.o svc_rdma.o \ - xdr_rdma.o rdma_subr.o xdrrdma_sizeof.o - -TLIMOD_OBJS += tlimod.o t_kalloc.o t_kbind.o t_kclose.o \ - t_kconnect.o t_kfree.o t_kgtstate.o t_kopen.o \ - t_krcvudat.o t_ksndudat.o t_kspoll.o t_kunbind.o \ - t_kutil.o - -RLMOD_OBJS += rlmod.o - -TELMOD_OBJS += telmod.o - -CRYPTMOD_OBJS += cryptmod.o - -KB_OBJS += kbd.o keytables.o - -# -# ID mapping module -# -IDMAP_OBJS += idmap_mod.o idmap_kapi.o idmap_xdr.o idmap_cache.o - -# -# scheduling class modules -# -SDC_OBJS += sysdc.o - -RT_OBJS += rt.o -RT_DPTBL_OBJS += rt_dptbl.o - -TS_OBJS += ts.o -TS_DPTBL_OBJS += ts_dptbl.o - -IA_OBJS += ia.o - -FSS_OBJS += fss.o - -FX_OBJS += fx.o -FX_DPTBL_OBJS += fx_dptbl.o - -# -# Inter-Process Communication (IPC) modules -# -IPC_OBJS += ipc.o - -IPCMSG_OBJS += msg.o - -IPCSEM_OBJS += sem.o - -IPCSHM_OBJS += shm.o - -# -# bignum module -# -COMMON_BIGNUM_OBJS += bignum_mod.o bignumimpl.o - -BIGNUM_OBJS += $(COMMON_BIGNUM_OBJS) $(BIGNUM_PSR_OBJS) - -# -# kernel cryptographic framework -# -KCF_OBJS += kcf.o kcf_callprov.o kcf_cbufcall.o kcf_cipher.o kcf_crypto.o \ - kcf_cryptoadm.o kcf_ctxops.o kcf_digest.o kcf_dual.o \ - kcf_keys.o kcf_mac.o kcf_mech_tabs.o kcf_miscapi.o \ - kcf_object.o kcf_policy.o kcf_prov_lib.o kcf_prov_tabs.o \ - kcf_sched.o kcf_session.o kcf_sign.o kcf_spi.o kcf_verify.o \ - kcf_random.o modes.o ecb.o cbc.o ctr.o ccm.o gcm.o 
fips_random.o - -CRYPTOADM_OBJS += cryptoadm.o - -CRYPTO_OBJS += crypto.o - -DPROV_OBJS += dprov.o - -DCA_OBJS += dca.o dca_3des.o dca_debug.o dca_dsa.o dca_kstat.o dca_rng.o \ - dca_rsa.o - -AESPROV_OBJS += aes.o aes_impl.o aes_modes.o fips_aes_util.o - -ARCFOURPROV_OBJS += arcfour.o arcfour_crypt.o - -BLOWFISHPROV_OBJS += blowfish.o blowfish_impl.o - -ECCPROV_OBJS += ecc.o ec.o ec2_163.o ec2_mont.o ecdecode.o ecl_mult.o \ - ecp_384.o ecp_jac.o ec2_193.o ecl.o ecp_192.o ecp_521.o \ - ecp_jm.o ec2_233.o ecl_curve.o ecp_224.o ecp_aff.o \ - ecp_mont.o ec2_aff.o ec_naf.o ecl_gf.o ecp_256.o mp_gf2m.o \ - mpi.o mplogic.o mpmontg.o mpprime.o oid.o \ - secitem.o ec2_test.o ecp_test.o fips_ecc_util.o - -RSAPROV_OBJS += rsa.o rsa_impl.o fips_rsa_util.o - -SWRANDPROV_OBJS += swrand.o fips_random_util.o - -# -# kernel SSL -# -KSSL_OBJS += kssl.o ksslioctl.o ksslapi.o ksslrec.o - -# -# misc. modules -# - -C2AUDIT_OBJS += adr.o audit.o audit_event.o audit_io.o \ - audit_path.o audit_start.o audit_syscalls.o audit_token.o \ - audit_mem.o audit_zone.o - -PCIC_OBJS += pcic.o - -RPCSEC_OBJS += secmod.o sec_clnt.o sec_svc.o sec_gen.o \ - auth_des.o auth_kern.o auth_none.o auth_loopb.o\ - authdesprt.o authdesubr.o authu_prot.o \ - key_call.o key_prot.o svc_authu.o svcauthdes.o - -RPCSEC_GSS_OBJS += rpcsec_gssmod.o rpcsec_gss.o rpcsec_gss_misc.o \ - rpcsec_gss_utils.o svc_rpcsec_gss.o - -CONSCONFIG_OBJS += consconfig.o - -CONSCONFIG_DACF_OBJS += consconfig_dacf.o consplat.o - -TEM_OBJS += tem.o tem_safe.o 6x10.o 7x14.o 12x22.o - -KBTRANS_OBJS += \ - kbtrans.o \ - kbtrans_keytables.o \ - kbtrans_polled.o \ - kbtrans_streams.o \ - usb_keytables.o - -KGSSD_OBJS += gssd_clnt_stubs.o gssd_handle.o gssd_prot.o \ - gss_display_name.o gss_release_name.o gss_import_name.o \ - gss_release_buffer.o gss_release_oid_set.o gen_oids.o gssdmod.o - -KGSSD_DERIVED_OBJS = gssd_xdr.o - -KGSS_DUMMY_OBJS += dmech.o - -KSOCKET_OBJS += ksocket.o ksocket_mod.o - -CRYPTO= cksumtypes.o decrypt.o encrypt.o encrypt_length.o etypes.o \ - nfold.o verify_checksum.o prng.o block_size.o make_checksum.o\ - checksum_length.o hmac.o default_state.o mandatory_sumtype.o - -# crypto/des -CRYPTO_DES= f_cbc.o f_cksum.o f_parity.o weak_key.o d3_cbc.o ef_crypto.o - -CRYPTO_DK= checksum.o derive.o dk_decrypt.o dk_encrypt.o - -CRYPTO_ARCFOUR= k5_arcfour.o - -# crypto/enc_provider -CRYPTO_ENC= des.o des3.o arcfour_provider.o aes_provider.o - -# crypto/hash_provider -CRYPTO_HASH= hash_kef_generic.o hash_kmd5.o hash_crc32.o hash_ksha1.o - -# crypto/keyhash_provider -CRYPTO_KEYHASH= descbc.o k5_kmd5des.o k_hmac_md5.o - -# crypto/crc32 -CRYPTO_CRC32= crc32.o - -# crypto/old -CRYPTO_OLD= old_decrypt.o old_encrypt.o - -# crypto/raw -CRYPTO_RAW= raw_decrypt.o raw_encrypt.o - -K5_KRB= kfree.o copy_key.o \ - parse.o init_ctx.o \ - ser_adata.o ser_addr.o \ - ser_auth.o ser_cksum.o \ - ser_key.o ser_princ.o \ - serialize.o unparse.o \ - ser_actx.o - -K5_OS= timeofday.o toffset.o \ - init_os_ctx.o c_ustime.o - -SEAL= -# EXPORT DELETE START -SEAL= seal.o unseal.o -# EXPORT DELETE END - -MECH= delete_sec_context.o \ - import_sec_context.o \ - gssapi_krb5.o \ - k5seal.o k5unseal.o k5sealv3.o \ - ser_sctx.o \ - sign.o \ - util_crypt.o \ - util_validate.o util_ordering.o \ - util_seqnum.o util_set.o util_seed.o \ - wrap_size_limit.o verify.o - - - -MECH_GEN= util_token.o - - -KGSS_KRB5_OBJS += krb5mech.o \ - $(MECH) $(SEAL) $(MECH_GEN) \ - $(CRYPTO) $(CRYPTO_DES) $(CRYPTO_DK) $(CRYPTO_ARCFOUR) \ - $(CRYPTO_ENC) $(CRYPTO_HASH) \ - $(CRYPTO_KEYHASH) $(CRYPTO_CRC32) \ - 
$(CRYPTO_OLD) \ - $(CRYPTO_RAW) $(K5_KRB) $(K5_OS) - -DES_OBJS += des_crypt.o des_impl.o des_ks.o des_soft.o fips_des_util.o - -DLBOOT_OBJS += bootparam_xdr.o nfs_dlinet.o scan.o - -KRTLD_OBJS += kobj_bootflags.o getoptstr.o \ - kobj.o kobj_kdi.o kobj_lm.o kobj_subr.o - -MOD_OBJS += modctl.o modsubr.o modsysfile.o modconf.o modhash.o - -STRPLUMB_OBJS += strplumb.o - -CPR_OBJS += cpr_driver.o cpr_dump.o \ - cpr_main.o cpr_misc.o cpr_mod.o cpr_stat.o \ - cpr_uthread.o - -PROF_OBJS += prf.o - -SE_OBJS += se_driver.o - -SYSACCT_OBJS += acct.o - -ACCTCTL_OBJS += acctctl.o - -EXACCTSYS_OBJS += exacctsys.o - -KAIO_OBJS += aio.o - -PCMCIA_OBJS += pcmcia.o cs.o cis.o cis_callout.o cis_handlers.o cis_params.o - -BUSRA_OBJS += busra.o - -PCS_OBJS += pcs.o - -PCAN_OBJS += pcan.o - -PCATA_OBJS += pcide.o pcdisk.o pclabel.o pcata.o - -PCSER_OBJS += pcser.o pcser_cis.o - -PCWL_OBJS += pcwl.o - -PSET_OBJS += pset.o - -OHCI_OBJS += ohci.o ohci_hub.o ohci_polled.o - -UHCI_OBJS += uhci.o uhciutil.o uhcitgt.o uhcihub.o uhcipolled.o - -EHCI_OBJS += ehci.o ehci_hub.o ehci_xfer.o ehci_intr.o ehci_util.o ehci_polled.o ehci_isoch.o ehci_isoch_util.o - -HUBD_OBJS += hubd.o - -USB_MID_OBJS += usb_mid.o - -USB_IA_OBJS += usb_ia.o - -UWBA_OBJS += uwba.o uwbai.o - -SCSA2USB_OBJS += scsa2usb.o usb_ms_bulkonly.o usb_ms_cbi.o - -HWAHC_OBJS += hwahc.o hwahc_util.o - -WUSB_DF_OBJS += wusb_df.o -WUSB_FWMOD_OBJS += wusb_fwmod.o - -IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \ - ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \ - ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o - -IBD_OBJS += ibd.o ibd_cm.o - -DLPISTUB_OBJS += dlpistub.o - -SDP_OBJS += sdpddi.o - -TRILL_OBJS += trill.o - -CTF_OBJS += ctf_create.o ctf_decl.o ctf_error.o ctf_hash.o ctf_labels.o \ - ctf_lookup.o ctf_open.o ctf_types.o ctf_util.o ctf_subr.o ctf_mod.o - -SMBIOS_OBJS += smb_error.o smb_info.o smb_open.o smb_subr.o smb_dev.o - -RPCIB_OBJS += rpcib.o - -KMDB_OBJS += kdrv.o - -AFE_OBJS += afe.o - -BGE_OBJS += bge_main2.o bge_chip2.o bge_kstats.o bge_log.o bge_ndd.o \ - bge_atomic.o bge_mii.o bge_send.o bge_recv2.o bge_mii_5906.o - -DMFE_OBJS += dmfe_log.o dmfe_main.o dmfe_mii.o - -ELXL_OBJS += elxl.o - -HME_OBJS += hme.o - -IXGB_OBJS += ixgb.o ixgb_atomic.o ixgb_chip.o ixgb_gld.o ixgb_kstats.o \ - ixgb_log.o ixgb_ndd.o ixgb_rx.o ixgb_tx.o ixgb_xmii.o - -NGE_OBJS += nge_main.o nge_atomic.o nge_chip.o nge_ndd.o nge_kstats.o \ - nge_log.o nge_rx.o nge_tx.o nge_xmii.o - -RGE_OBJS += rge_main.o rge_chip.o rge_ndd.o rge_kstats.o rge_log.o rge_rxtx.o - -URTW_OBJS += urtw.o - -ARN_OBJS += arn_hw.o arn_eeprom.o arn_mac.o arn_calib.o arn_ani.o arn_phy.o arn_regd.o arn_beacon.o \ - arn_main.o arn_recv.o arn_xmit.o arn_rc.o - -ATH_OBJS += ath_aux.o ath_main.o ath_osdep.o ath_rate.o - -ATU_OBJS += atu.o - -IPW_OBJS += ipw2100_hw.o ipw2100.o - -IWI_OBJS += ipw2200_hw.o ipw2200.o - -IWH_OBJS += iwh.o - -IWK_OBJS += iwk2.o - -IWP_OBJS += iwp.o - -MWL_OBJS += mwl.o - -MWLFW_OBJS += mwlfw_mode.o - -WPI_OBJS += wpi.o - -RAL_OBJS += rt2560.o ral_rate.o - -RUM_OBJS += rum.o - -RWD_OBJS += rt2661.o - -RWN_OBJS += rt2860.o - -UATH_OBJS += uath.o - -UATHFW_OBJS += uathfw_mod.o - -URAL_OBJS += ural.o - -RTW_OBJS += rtw.o smc93cx6.o rtwphy.o rtwphyio.o - -ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o - -MXFE_OBJS += mxfe.o - -MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o mptsas_raid.o - -SFE_OBJS += sfe.o sfe_util.o - -BFE_OBJS += bfe.o - -BRIDGE_OBJS += bridge.o - -DDA_OBJS += dda.o - -DMD_OBJS += dmd.o - -IDM_SHARED_OBJS += base64.o 
- -IDM_OBJS += $(IDM_SHARED_OBJS) \ - idm.o idm_impl.o idm_text.o idm_conn_sm.o idm_so.o - -VR_OBJS += vr.o - -ATGE_OBJS += atge_main.o atge_l1e.o atge_mii.o atge_l1.o - -YGE_OBJS = yge.o - -# -# Build up defines and paths. -# -LINT_DEFS += -Dunix - -# -# This duality can be removed when the native and target compilers -# are the same (or at least recognize the same command line syntax!) -# It is a bug in the current compilation system that the assember -# can't process the -Y I, flag. -# -NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common -AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common -INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common - -PCIEB_OBJS += pcieb.o - -# Chelsio N110 10G NIC driver module -# -CH_OBJS = ch.o glue.o pe.o sge.o - -CH_COM_OBJS = ch_mac.o ch_subr.o cspi.o espi.o ixf1010.o mc3.o mc4.o mc5.o \ - mv88e1xxx.o mv88x201x.o my3126.o pm3393.o tp.o ulp.o \ - vsc7321.o vsc7326.o xpak.o - -# -# PCI strings file -# -PCI_STRING_OBJS = pci_strings.o - -NET_DACF_OBJS += net_dacf.o - -# -# Xframe 10G NIC driver module -# -XGE_OBJS = xge.o xgell.o - -XGE_HAL_OBJS = xgehal-channel.o xgehal-fifo.o xgehal-ring.o xgehal-config.o \ - xgehal-driver.o xgehal-mm.o xgehal-stats.o xgehal-device.o \ - xge-queue.o xgehal-mgmt.o xgehal-mgmtaux.o - -# -# e1000g module -# -E1000G_OBJS += e1000_80003es2lan.o e1000_82540.o e1000_82541.o e1000_82542.o \ - e1000_82543.o e1000_82571.o e1000_api.o e1000_ich8lan.o \ - e1000_mac.o e1000_manage.o e1000_nvm.o e1000_osdep.o \ - e1000_phy.o e1000g_debug.o e1000g_main.o e1000g_alloc.o \ - e1000g_tx.o e1000g_rx.o e1000g_stat.o - -# -# Intel 82575 1G NIC driver module -# -IGB_OBJS = igb_82575.o igb_api.o igb_mac.o igb_manage.o \ - igb_nvm.o igb_osdep.o igb_phy.o igb_buf.o \ - igb_debug.o igb_gld.o igb_log.o igb_main.o \ - igb_rx.o igb_stat.o igb_tx.o - -# -# Intel 10GbE PCIE NIC driver module -# -IXGBE_OBJS = ixgbe_82598.o ixgbe_82599.o ixgbe_api.o \ - ixgbe_common.o ixgbe_phy.o \ - ixgbe_buf.o ixgbe_debug.o ixgbe_gld.o \ - ixgbe_log.o ixgbe_main.o \ - ixgbe_osdep.o ixgbe_rx.o ixgbe_stat.o \ - ixgbe_tx.o - -# -# NIU 10G/1G driver module -# -NXGE_OBJS = nxge_mac.o nxge_ipp.o nxge_rxdma.o \ - nxge_txdma.o nxge_txc.o nxge_main.o \ - nxge_hw.o nxge_fzc.o nxge_virtual.o \ - nxge_send.o nxge_classify.o nxge_fflp.o \ - nxge_fflp_hash.o nxge_ndd.o nxge_kstats.o \ - nxge_zcp.o nxge_fm.o nxge_espc.o nxge_hv.o \ - nxge_hio.o nxge_hio_guest.o nxge_intr.o - -NXGE_NPI_OBJS = \ - npi.o npi_mac.o npi_ipp.o \ - npi_txdma.o npi_rxdma.o npi_txc.o \ - npi_zcp.o npi_espc.o npi_fflp.o \ - npi_vir.o - -NXGE_HCALL_OBJS = \ - nxge_hcall.o - -# -# kiconv modules -# -KICONV_EMEA_OBJS += kiconv_emea.o - -# -# blk2scsa -# -BLK2SCSA_OBJS = blk2scsa.o - -KICONV_JA_OBJS += kiconv_ja.o - -KICONV_KO_OBJS += kiconv_cck_common.o kiconv_ko.o - -KICONV_SC_OBJS += kiconv_cck_common.o kiconv_sc.o - -KICONV_TC_OBJS += kiconv_cck_common.o kiconv_tc.o - -# -# AAC module -# -AAC_OBJS = aac.o aac_ioctl.o - -# -# sdcard modules -# -SDA_OBJS = sda_cmd.o sda_host.o sda_init.o sda_mem.o sda_mod.o \ - sda_nexus.o sda_slot.o -SDCARD_OBJS = sdcard.o -SDHOST_OBJS = sdhost.o - -# -# hxge 10G driver module -# -HXGE_OBJS = hxge_main.o hxge_vmac.o hxge_send.o \ - hxge_txdma.o hxge_rxdma.o hxge_virtual.o \ - hxge_fm.o hxge_fzc.o hxge_hw.o hxge_kstats.o \ - hxge_ndd.o hxge_pfc.o \ - hpi.o hpi_vmac.o hpi_rxdma.o hpi_txdma.o \ - hpi_vir.o hpi_pfc.o - -# -# MEGARAID_SAS module -# -MEGA_SAS_OBJS = megaraid_sas.o - -# -# MR_SAS module -# -MR_SAS_OBJS = mr_sas.o - -# -# ISCSI_INITIATOR module -# 
-ISCSI_INITIATOR_OBJS = chap.o iscsi_io.o iscsi_thread.o \ - iscsi_ioctl.o iscsid.o iscsi.o \ - iscsi_login.o isns_client.o iscsiAuthClient.o \ - iscsi_lun.o iscsiAuthClientGlue.o \ - iscsi_net.o nvfile.o iscsi_cmd.o \ - iscsi_queue.o persistent.o iscsi_conn.o \ - iscsi_sess.o radius_auth.o iscsi_crc.o \ - iscsi_stats.o radius_packet.o iscsi_doorclt.o \ - iscsi_targetparam.o utils.o kifconf.o - -# -# ntxn 10Gb/1Gb NIC driver module -# -NTXN_OBJS = unm_nic_init.o unm_gem.o unm_nic_hw.o unm_ndd.o \ - unm_nic_main.o unm_nic_isr.o unm_nic_ctx.o niu.o - -# -# Myricom 10Gb NIC driver module -# -MYRI10GE_OBJS = myri10ge.o myri10ge_lro.o - -# nulldriver module -# -NULLDRIVER_OBJS = nulldriver.o - -TPM_OBJS = tpm.o tpm_hcall.o Index: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c,v retrieving revision 1.35 diff -u -p -r1.35 dtrace.c --- src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c 7 Jan 2017 21:39:53 -0000 1.35 +++ src/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c 11 Jun 2017 11:48:36 -0000 @@ -18,16 +18,15 @@ * * CDDL HEADER END * - * $FreeBSD: src/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c,v 1.10.2.1 2009/08/03 08:13:06 kensmith Exp $ + * $FreeBSD: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 313266 2017-02-05 02:47:34Z markj $ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ -/* #pragma ident "%Z%%M% %I% %E% SMI" */ - /* * DTrace - Dynamic Tracing for Solaris * @@ -67,82 +66,119 @@ * [Group] Functions", allowing one to find each block by searching forward * on capital-f functions. 
*/ -#if !defined(sun) -/* we need internal access to mutexes for state inspection */ +#ifdef __NetBSD__ #define __MUTEX_PRIVATE #define __RWLOCK_PRIVATE +#include #endif #include -#if !defined(sun) +#ifndef illumos #include #endif #include +#include #include #include -#if defined(sun) -#include +#ifdef illumos #include #include #endif #include #include -#if defined(sun) +#ifdef illumos #include #endif #include #include #include #include +#ifdef illumos #include #include +#endif #include -#if defined(sun) +#ifdef illumos #include #include #endif #include -#if defined(sun) +#ifdef illumos #include #include #endif #include -#if defined(sun) +#ifdef illumos #include #include #endif #include #include #include +#include "strtolctype.h" /* FreeBSD includes: */ -#if !defined(sun) - +#ifdef __FreeBSD__ +#include #include -#include -//#include +#include +#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include #include +#endif + +#ifdef __NetBSD__ +#include +#include +#include +#include +#include #include #include +#include #include -//#include +#include +#include +#include +#include #include #include -#include #include #include #include +#endif + +#ifndef illumos + +#include + +#include "dtrace_xoroshiro128_plus.h" + #include + #include "dtrace_cddl.h" #include "dtrace_debug.c" -#endif -#if !defined(sun) -/* fake module entry for netbsd */ -module_t *mod_nbsd = NULL; -#endif +#ifdef __NetBSD__ +struct dtrace_state_worker *dtrace_state_worker_add(void (*fn)(dtrace_state_t *), + dtrace_state_t *state, hrtime_t interval); +void dtrace_state_worker_remove(struct dtrace_state_worker *w); + +modctl_t *mod_nbsd; + +#endif /* __NetBSD__ */ + +#endif /* !illumos */ + /* * DTrace Tunable Variables @@ -165,17 +201,21 @@ module_t *mod_nbsd = NULL; * /etc/system. 
*/ int dtrace_destructive_disallow = 0; +#ifndef illumos +/* Positive logic version of dtrace_destructive_disallow for loader tunable */ +int dtrace_allow_destructive = 1; +#endif dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024); size_t dtrace_difo_maxsize = (256 * 1024); -dtrace_optval_t dtrace_dof_maxsize = (256 * 1024); -size_t dtrace_global_maxsize = (16 * 1024); +dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024); +size_t dtrace_statvar_maxsize = (16 * 1024); size_t dtrace_actions_max = (16 * 1024); size_t dtrace_retain_max = 1024; -dtrace_optval_t dtrace_helper_actions_max = 32; +dtrace_optval_t dtrace_helper_actions_max = 128; dtrace_optval_t dtrace_helper_providers_max = 32; dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); size_t dtrace_strsize_default = 256; -dtrace_optval_t dtrace_cleanrate_default = 99009900; /* 101 hz */ +dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */ dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */ dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */ dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */ @@ -189,13 +229,17 @@ dtrace_optval_t dtrace_ustackframes_defa dtrace_optval_t dtrace_jstackframes_default = 50; dtrace_optval_t dtrace_jstackstrsize_default = 512; int dtrace_msgdsize_max = 128; -hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */ +hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */ hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */ int dtrace_devdepth_max = 32; int dtrace_err_verbose; hrtime_t dtrace_deadman_interval = NANOSEC; hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC; +hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC; +#ifndef illumos +int dtrace_memstr_max = 4096; +#endif /* * DTrace External Variables @@ -211,22 +255,28 @@ const char dtrace_zero[256] = { 0 }; /* /* * DTrace Internal Variables */ -#if defined(sun) +#ifdef illumos static dev_info_t *dtrace_devi; /* device info */ #endif +#ifdef illumos static vmem_t *dtrace_arena; /* probe ID arena */ -#if defined(sun) static vmem_t *dtrace_minor; /* minor number arena */ +#else static taskq_t *dtrace_taskq; /* task queue */ +#ifdef __NetBSD__ +static vmem_t *dtrace_arena; /* probe ID arena */ +#else +static struct unrhdr *dtrace_arena; /* Probe ID number. 
*/ +#endif #endif static dtrace_probe_t **dtrace_probes; /* array of all probes */ -int dtrace_probes_size=0; /* size for kmem_free */ static int dtrace_nprobes; /* number of probes */ static dtrace_provider_t *dtrace_provider; /* provider list */ static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ static int dtrace_opens; /* number of opens */ static int dtrace_helpers; /* number of helpers */ -#if defined(sun) +static int dtrace_getf; /* number of unpriv getf()s */ +#ifdef illumos static void *dtrace_softstate; /* softstate pointer */ #endif static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ @@ -243,12 +293,14 @@ static dtrace_ecb_t *dtrace_ecb_create_c static dtrace_genid_t dtrace_probegen; /* current probe generation */ static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */ static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */ +static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */ static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */ -#if !defined(sun) -int dtrace_in_probe; /* non-zero if executing a probe */ -#if defined(__i386__) || defined(__amd64__) -uintptr_t dtrace_in_probe_addr; /* Address of invop when already in probe */ -#endif +static int dtrace_dynvar_failclean; /* dynvars failed to clean */ +#ifdef __FreeBSD__ +static struct mtx dtrace_unr_mtx; +MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF); +static eventhandler_tag dtrace_kld_load_tag; +static eventhandler_tag dtrace_kld_unload_try_tag; #endif /* @@ -285,19 +337,23 @@ static kmutex_t dtrace_lock; /* probe static kmutex_t dtrace_provider_lock; /* provider state lock */ static kmutex_t dtrace_meta_lock; /* meta-provider state lock */ -#if !defined(sun) +#ifndef illumos /* XXX FreeBSD hacks. 
*/ +#ifdef __FreeBSD__ static kmutex_t mod_lock; +#endif #define cr_suid cr_svuid #define cr_sgid cr_svgid #define ipaddr_t in_addr_t #define mod_modname pathname #define vuprintf vprintf +#ifdef __NetBSD__ #define ttoproc(_a) ((_a)->l_proc) +#else +#define ttoproc(_a) ((_a)->td_proc) +#endif #define crgetzoneid(_a) 0 -//#define NCPU MAXCPUS -#define NCPU ncpu #define SNOCD 0 #define CPU_ON_INTR(_a) 0 @@ -309,12 +365,18 @@ static kmutex_t mod_lock; #define PRIV_PROC_ZONE (1 << 5) #define PRIV_ALL ~0 -//SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information"); +SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information"); +SYSCTL_DECL(_debug_dtrace); +SYSCTL_DECL(_kern_dtrace); #endif -#if defined(sun) +#ifdef illumos #define curcpu_id CPU->cpu_id -#else +#endif +#ifdef __FreeBSD__ +#define curcpu_id curcpu +#endif +#ifdef __NetBSD__ #define curcpu_id cpu_number() #endif @@ -337,20 +399,10 @@ static void dtrace_nullop(void) {} -static int -dtrace_enable_nullop(void) -{ - return (0); -} - static dtrace_pops_t dtrace_provider_ops = { - (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, -#if defined(sun) - (void (*)(void *, modctl_t *))dtrace_nullop, -#else - (void (*)(void *, dtrace_modctl_t *))dtrace_nullop, -#endif - (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, + (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop, + (void (*)(void *, modctl_t *))dtrace_nullop, + (int (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, @@ -366,17 +418,22 @@ dtrace_id_t dtrace_probeid_error; /* sp /* * DTrace Helper Tracing Variables - */ -uint32_t dtrace_helptrace_next = 0; -uint32_t dtrace_helptrace_nlocals; -char *dtrace_helptrace_buffer; -int dtrace_helptrace_bufsize = 512 * 1024; - -#ifdef DEBUG -int dtrace_helptrace_enabled = 1; -#else -int dtrace_helptrace_enabled = 0; -#endif + * + * These variables should be set dynamically to enable helper tracing. The + * only variables that should be set are dtrace_helptrace_enable (which should + * be set to a non-zero value to allocate helper tracing buffers on the next + * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a + * non-zero value to deallocate helper tracing buffers on the next close of + * /dev/dtrace). When (and only when) helper tracing is disabled, the + * buffer size may also be set via dtrace_helptrace_bufsize. + */ +int dtrace_helptrace_enable = 0; +int dtrace_helptrace_disable = 0; +int dtrace_helptrace_bufsize = 16 * 1024 * 1024; +uint32_t dtrace_helptrace_nlocals; +static dtrace_helptrace_t *dtrace_helptrace_buffer; +static uint32_t dtrace_helptrace_next = 0; +static int dtrace_helptrace_wrapped = 0; /* * DTrace Error Hashing @@ -434,7 +491,7 @@ static kmutex_t dtrace_errlock; * no way for a global variable key signature to match a thread-local key * signature. 
*/ -#if defined(sun) +#ifdef illumos #define DTRACE_TLS_THRKEY(where) { \ uint_t intr = 0; \ uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \ @@ -444,27 +501,27 @@ static kmutex_t dtrace_errlock; (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \ (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ } -#else -#define DTRACE_TLS_THRKEY(where) { \ - uint_t intr = 0; \ - (where) = ((curthread->l_lid + (curthread->l_proc->p_pid << 16) + \ - DIF_VARIABLE_MAX) & \ - (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ -} -#if 0 +#endif +#ifdef __FreeBSD__ #define DTRACE_TLS_THRKEY(where) { \ - solaris_cpu_t *_c = &solaris_cpu[curcpu_id]; \ + solaris_cpu_t *_c = &solaris_cpu[curcpu]; \ uint_t intr = 0; \ uint_t actv = _c->cpu_intr_actv; \ for (; actv; actv >>= 1) \ intr++; \ ASSERT(intr < (1 << 3)); \ + (where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \ + (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ +} +#endif +#ifdef __NetBSD__ +#define DTRACE_TLS_THRKEY(where) { \ + uint_t intr = 0; \ (where) = ((curthread->l_lid + (curthread->l_proc->p_pid << 16) + \ DIF_VARIABLE_MAX) & \ (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ } #endif -#endif #define DT_BSWAP_8(x) ((x) & 0xff) #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8)) @@ -476,7 +533,7 @@ static kmutex_t dtrace_errlock; #define DTRACE_STORE(type, tomax, offset, what) \ *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what); -#ifndef __i386 +#ifndef __x86 #define DTRACE_ALIGNCHECK(addr, size, flags) \ if (addr & (size - 1)) { \ *flags |= CPU_DTRACE_BADALIGN; \ @@ -494,10 +551,18 @@ static kmutex_t dtrace_errlock; * disallow all negative sizes. Ranges of size 0 are allowed. */ #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ - ((testaddr) - (baseaddr) < (basesz) && \ - (testaddr) + (testsz) - (baseaddr) <= (basesz) && \ + ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \ + (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \ (testaddr) + (testsz) >= (testaddr)) +#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \ +do { \ + if ((remp) != NULL) { \ + *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \ + } \ +_NOTE(CONSTCOND) } while (0) + + /* * Test whether alloc_sz bytes will fit in the scratch region. 
We isolate * alloc_sz on the righthand side of the comparison in order to avoid overflow @@ -584,21 +649,19 @@ static dtrace_probe_t *dtrace_probe_look static void dtrace_enabling_provide(dtrace_provider_t *); static int dtrace_enabling_match(dtrace_enabling_t *, int *); static void dtrace_enabling_matchall(void); +static void dtrace_enabling_reap(void); static dtrace_state_t *dtrace_anon_grab(void); static uint64_t dtrace_helper(int, dtrace_mstate_t *, dtrace_state_t *, uint64_t, uint64_t); -#if defined(sun) static dtrace_helpers_t *dtrace_helpers_create(proc_t *); -#endif static void dtrace_buffer_drop(dtrace_buffer_t *); +static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when); static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t, dtrace_state_t *, dtrace_mstate_t *); static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, dtrace_optval_t); static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); -#if defined(sun) static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); -#endif uint16_t dtrace_load16(uintptr_t); uint32_t dtrace_load32(uintptr_t); uint64_t dtrace_load64(uintptr_t); @@ -607,6 +670,12 @@ void dtrace_dynvar_clean(dtrace_dstate_t dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *, size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *); uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *); +static int dtrace_priv_proc(dtrace_state_t *); +static void dtrace_getf_barrier(void); +static int dtrace_canload_remains(uint64_t, size_t, size_t *, + dtrace_mstate_t *, dtrace_vstate_t *); +static int dtrace_canstore_remains(uint64_t, size_t, size_t *, + dtrace_mstate_t *, dtrace_vstate_t *); /* * DTrace Probe Context Functions @@ -631,7 +700,7 @@ dtrace_panic(const char *format, ...) va_list alist; va_start(alist, format); -#ifdef __NetBSD__ +#ifndef illumos vpanic(format, alist); #else dtrace_vpanic(format, alist); @@ -694,10 +763,12 @@ dtrace_error(uint32_t *counter) * Use the DTRACE_LOADFUNC macro to define functions for each of loading a * uint8_t, a uint16_t, a uint32_t and a uint64_t. */ +/* BEGIN CSTYLED */ DTRACE_LOADFUNC(8) DTRACE_LOADFUNC(16) DTRACE_LOADFUNC(32) DTRACE_LOADFUNC(64) +/* END CSTYLED */ static int dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate) @@ -715,19 +786,43 @@ dtrace_inscratch(uintptr_t dest, size_t } static int -dtrace_canstore_statvar(uint64_t addr, size_t sz, +dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain, dtrace_statvar_t **svars, int nsvars) { int i; + size_t maxglobalsize, maxlocalsize; + + if (nsvars == 0) + return (0); + + maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t); + maxlocalsize = maxglobalsize * NCPU; for (i = 0; i < nsvars; i++) { dtrace_statvar_t *svar = svars[i]; + uint8_t scope; + size_t size; - if (svar == NULL || svar->dtsv_size == 0) + if (svar == NULL || (size = svar->dtsv_size) == 0) continue; - if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) + scope = svar->dtsv_var.dtdv_scope; + + /* + * We verify that our size is valid in the spirit of providing + * defense in depth: we want to prevent attackers from using + * DTrace to escalate an orthogonal kernel heap corruption bug + * into the ability to store to arbitrary locations in memory. 
+ */ + VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) || + (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize)); + + if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, + svar->dtsv_size)) { + DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data, + svar->dtsv_size); return (1); + } } return (0); @@ -743,12 +838,26 @@ static int dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { + return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate)); +} + +/* + * Implementation of dtrace_canstore which communicates the upper bound of the + * allowed memory region. + */ +static int +dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ /* * First, check to see if the address is in scratch space... */ if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base, - mstate->dtms_scratch_size)) + mstate->dtms_scratch_size)) { + DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base, + mstate->dtms_scratch_size); return (1); + } /* * Now check to see if it's a dynamic variable. This check will pick @@ -761,6 +870,7 @@ dtrace_canstore(uint64_t addr, size_t sz uintptr_t base = (uintptr_t)dstate->dtds_base + (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t)); uintptr_t chunkoffs; + dtrace_dynvar_t *dvar; /* * Before we assume that we can store here, we need to make @@ -777,6 +887,8 @@ dtrace_canstore(uint64_t addr, size_t sz * * (3) Not span a chunk boundary * + * (4) Not be in the tuple space of a dynamic variable + * */ if (addr < base) return (0); @@ -789,6 +901,16 @@ dtrace_canstore(uint64_t addr, size_t sz if (chunkoffs + sz > dstate->dtds_chunksize) return (0); + dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs); + + if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) + return (0); + + if (chunkoffs < sizeof (dtrace_dynvar_t) + + ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t))) + return (0); + + DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize); return (1); } @@ -796,11 +918,11 @@ dtrace_canstore(uint64_t addr, size_t sz * Finally, check the static local and global variables. These checks * take the longest, so we perform them last. */ - if (dtrace_canstore_statvar(addr, sz, + if (dtrace_canstore_statvar(addr, sz, remain, vstate->dtvs_locals, vstate->dtvs_nlocals)) return (1); - if (dtrace_canstore_statvar(addr, sz, + if (dtrace_canstore_statvar(addr, sz, remain, vstate->dtvs_globals, vstate->dtvs_nglobals)) return (1); @@ -821,27 +943,167 @@ static int dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { + return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate)); +} + +/* + * Implementation of dtrace_canload which communicates the uppoer bound of the + * allowed memory region. + */ +static int +dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) +{ volatile uintptr_t *illval = &cpu_core[curcpu_id].cpuc_dtrace_illval; + file_t *fp; /* * If we hold the privilege to read from kernel memory, then * everything is readable. */ - if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, addr, addr, sz); return (1); + } /* * You can obviously read that which you can store. */ - if (dtrace_canstore(addr, sz, mstate, vstate)) + if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate)) return (1); /* * We're allowed to read from our own string table. 
*/ - if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab, - mstate->dtms_difo->dtdo_strlen)) + if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab, + mstate->dtms_difo->dtdo_strlen)) { + DTRACE_RANGE_REMAIN(remain, addr, + mstate->dtms_difo->dtdo_strtab, + mstate->dtms_difo->dtdo_strlen); return (1); + } + + if (vstate->dtvs_state != NULL && + dtrace_priv_proc(vstate->dtvs_state)) { + proc_t *p; + + /* + * When we have privileges to the current process, there are + * several context-related kernel structures that are safe to + * read, even absent the privilege to read from kernel memory. + * These reads are safe because these structures contain only + * state that (1) we're permitted to read, (2) is harmless or + * (3) contains pointers to additional kernel state that we're + * not permitted to read (and as such, do not present an + * opportunity for privilege escalation). Finally (and + * critically), because of the nature of their relation with + * the current thread context, the memory associated with these + * structures cannot change over the duration of probe context, + * and it is therefore impossible for this memory to be + * deallocated and reallocated as something else while it's + * being operated upon. + */ + if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread, + sizeof (kthread_t)); + return (1); + } + + if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr, + sz, curthread->t_procp, sizeof (proc_t))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp, + sizeof (proc_t)); + return (1); + } + +#ifndef __NetBSD__ + if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cred, sizeof (cred_t))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred, + sizeof (cred_t)); + return (1); + } +#endif + +#ifdef illumos + if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz, + &(p->p_pidp->pid_id), sizeof (pid_t))) { + DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id), + sizeof (pid_t)); + return (1); + } + + if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz, + curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) { + DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu, + offsetof(cpu_t, cpu_pause_thread)); + return (1); + } +#endif + } + + if ((fp = mstate->dtms_getf) != NULL) { + uintptr_t psz = sizeof (void *); + vnode_t *vp; + vnodeops_t *op; + + /* + * When getf() returns a file_t, the enabling is implicitly + * granted the (transient) right to read the returned file_t + * as well as the v_path and v_op->vnop_name of the underlying + * vnode. These accesses are allowed after a successful + * getf() because the members that they refer to cannot change + * once set -- and the barrier logic in the kernel's closef() + * path assures that the file_t and its referenced vode_t + * cannot themselves be stale (that is, it impossible for + * either dtms_getf itself or its f_vnode member to reference + * freed memory). 
+ */ + if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) { + DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t)); + return (1); + } + + if ((vp = fp->f_vnode) != NULL) { + size_t slen; +#ifdef illumos + if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) { + DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path, + psz); + return (1); + } + slen = strlen(vp->v_path) + 1; + if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) { + DTRACE_RANGE_REMAIN(remain, addr, vp->v_path, + slen); + return (1); + } +#endif + + if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) { + DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op, + psz); + return (1); + } + +#ifdef illumos + if ((op = vp->v_op) != NULL && + DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) { + DTRACE_RANGE_REMAIN(remain, addr, + &op->vnop_name, psz); + return (1); + } + + if (op != NULL && op->vnop_name != NULL && + DTRACE_INRANGE(addr, sz, op->vnop_name, + (slen = strlen(op->vnop_name) + 1))) { + DTRACE_RANGE_REMAIN(remain, addr, + op->vnop_name, slen); + return (1); + } +#endif + } + } DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); *illval = addr; @@ -855,21 +1117,42 @@ dtrace_canload(uint64_t addr, size_t sz, * calls in the event that the user has all privileges. */ static int -dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, - dtrace_vstate_t *vstate) +dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { - size_t strsz; + size_t rsize; /* * If we hold the privilege to read from kernel memory, then * everything is readable. */ - if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) - return (1); - strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz); - if (dtrace_canload(addr, strsz, mstate, vstate)) + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, addr, addr, sz); return (1); + } + + /* + * Even if the caller is uninterested in querying the remaining valid + * range, it is required to ensure that the access is allowed. + */ + if (remain == NULL) { + remain = &rsize; + } + if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) { + size_t strsz; + /* + * Perform the strlen after determining the length of the + * memory region which is accessible. This prevents timing + * information from being used to find NULs in memory which is + * not accessible to the caller. + */ + strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, + MIN(sz, *remain)); + if (strsz <= *remain) { + return (1); + } + } return (0); } @@ -879,26 +1162,101 @@ dtrace_strcanload(uint64_t addr, size_t * region in which a load may be issued given the user's privilege level. */ static int -dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, - dtrace_vstate_t *vstate) +dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain, + dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { size_t sz; ASSERT(type->dtdt_flags & DIF_TF_BYREF); /* + * Calculate the max size before performing any checks since even + * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function + * return the max length via 'remain'. + */ + if (type->dtdt_kind == DIF_TYPE_STRING) { + dtrace_state_t *state = vstate->dtvs_state; + + if (state != NULL) { + sz = state->dts_options[DTRACEOPT_STRSIZE]; + } else { + /* + * In helper context, we have a NULL state; fall back + * to using the system-wide default for the string size + * in this case. 
+ */ + sz = dtrace_strsize_default; + } + } else { + sz = type->dtdt_size; + } + + /* * If we hold the privilege to read from kernel memory, then * everything is readable. */ - if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) + if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { + DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz); return (1); + } - if (type->dtdt_kind == DIF_TYPE_STRING) - sz = dtrace_strlen(src, - vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1; - else - sz = type->dtdt_size; + if (type->dtdt_kind == DIF_TYPE_STRING) { + return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate, + vstate)); + } + return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate, + vstate)); +} + +/* + * Convert a string to a signed integer using safe loads. + * + * NOTE: This function uses various macros from strtolctype.h to manipulate + * digit values, etc -- these have all been checked to ensure they make + * no additional function calls. + */ +static int64_t +dtrace_strtoll(char *input, int base, size_t limit) +{ + uintptr_t pos = (uintptr_t)input; + int64_t val = 0; + int x; + boolean_t neg = B_FALSE; + char c, cc, ccc; + uintptr_t end = pos + limit; + + /* + * Consume any whitespace preceding digits. + */ + while ((c = dtrace_load8(pos)) == ' ' || c == '\t') + pos++; + + /* + * Handle an explicit sign if one is present. + */ + if (c == '-' || c == '+') { + if (c == '-') + neg = B_TRUE; + c = dtrace_load8(++pos); + } - return (dtrace_canload((uintptr_t)src, sz, mstate, vstate)); + /* + * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it + * if present. + */ + if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' || + cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) { + pos += 2; + c = ccc; + } + + /* + * Read in contiguous digits until the first non-digit character. + */ + for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base; + c = dtrace_load8(++pos)) + val = val * base + x; + + return (neg ? -val : val); } /* @@ -1036,14 +1394,14 @@ dtrace_strcpy(const void *src, void *dst * specified type; we assume that we can store to directly. 
*/ static void -dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type) +dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit) { ASSERT(type->dtdt_flags & DIF_TF_BYREF); if (type->dtdt_kind == DIF_TYPE_STRING) { - dtrace_strcpy(src, dst, type->dtdt_size); + dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit)); } else { - dtrace_bcopy(src, dst, type->dtdt_size); + dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit)); } } @@ -1193,16 +1551,7 @@ dtrace_priv_proc_common_user(dtrace_stat */ ASSERT(s_cr != NULL); -#if defined(sun) - if ((cr = CRED()) != NULL && - s_cr->cr_uid == cr->cr_uid && - s_cr->cr_uid == cr->cr_ruid && - s_cr->cr_uid == cr->cr_suid && - s_cr->cr_gid == cr->cr_gid && - s_cr->cr_gid == cr->cr_rgid && - s_cr->cr_gid == cr->cr_sgid) - return (1); -#else +#ifdef __NetBSD__ if ((cr = CRED()) != NULL) { uid_t uid; gid_t gid; @@ -1215,10 +1564,18 @@ dtrace_priv_proc_common_user(dtrace_stat uid == kauth_cred_getsvuid(cr) && gid == kauth_cred_getgid(cr) && gid == kauth_cred_getegid(cr) && - gid == kauth_cred_getsvgid(cr)) { + gid == kauth_cred_getsvgid(cr)) return 1; - } } +#else + if ((cr = CRED()) != NULL && + s_cr->cr_uid == cr->cr_uid && + s_cr->cr_uid == cr->cr_ruid && + s_cr->cr_uid == cr->cr_suid && + s_cr->cr_gid == cr->cr_gid && + s_cr->cr_gid == cr->cr_rgid && + s_cr->cr_gid == cr->cr_sgid) + return (1); #endif return (0); @@ -1232,7 +1589,7 @@ dtrace_priv_proc_common_user(dtrace_stat static int dtrace_priv_proc_common_zone(dtrace_state_t *state) { -#if defined(sun) +#ifdef illumos cred_t *cr, *s_cr = state->dts_cred.dcr_cred; /* @@ -1241,7 +1598,7 @@ dtrace_priv_proc_common_zone(dtrace_stat */ ASSERT(s_cr != NULL); - if ((cr = CRED()) != NULL && + if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone) s_cr->cr_zone == cr->cr_zone) return (1); @@ -1342,38 +1699,214 @@ dtrace_priv_kernel_destructive(dtrace_st } /* - * Note: not called from probe context. This function is called - * asynchronously (and at a regular interval) from outside of probe context to - * clean the dirty dynamic variable lists on all CPUs. Dynamic variable - * cleaning is explained in detail in . + * Determine if the dte_cond of the specified ECB allows for processing of + * the current probe to continue. Note that this routine may allow continued + * processing, but with access(es) stripped from the mstate's dtms_access + * field. */ -void -dtrace_dynvar_clean(dtrace_dstate_t *dstate) +static int +dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, + dtrace_ecb_t *ecb) { - dtrace_dynvar_t *dirty; - dtrace_dstate_percpu_t *dcpu; - int i, work = 0; + dtrace_probe_t *probe = ecb->dte_probe; + dtrace_provider_t *prov = probe->dtpr_provider; + dtrace_pops_t *pops = &prov->dtpv_pops; + int mode = DTRACE_MODE_NOPRIV_DROP; - for (i = 0; i < NCPU; i++) { - dcpu = &dstate->dtds_percpu[i]; + ASSERT(ecb->dte_cond); - ASSERT(dcpu->dtdsc_rinsing == NULL); +#ifdef illumos + if (pops->dtps_mode != NULL) { + mode = pops->dtps_mode(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg); - /* - * If the dirty list is NULL, there is no dirty work to do. - */ - if (dcpu->dtdsc_dirty == NULL) - continue; + ASSERT((mode & DTRACE_MODE_USER) || + (mode & DTRACE_MODE_KERNEL)); + ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) || + (mode & DTRACE_MODE_NOPRIV_DROP)); + } + + /* + * If the dte_cond bits indicate that this consumer is only allowed to + * see user-mode firings of this probe, call the provider's dtps_mode() + * entry point to check that the probe was fired while in a user + * context. 
If that's not the case, use the policy specified by the + * provider to determine if we drop the probe or merely restrict + * operation. + */ + if (ecb->dte_cond & DTRACE_COND_USERMODE) { + ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); + + if (!(mode & DTRACE_MODE_USER)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; + } + } +#endif + + /* + * This is more subtle than it looks. We have to be absolutely certain + * that CRED() isn't going to change out from under us so it's only + * legit to examine that structure if we're in constrained situations. + * Currently, the only times we'll this check is if a non-super-user + * has enabled the profile or syscall providers -- providers that + * allow visibility of all processes. For the profile case, the check + * above will ensure that we're examining a user context. + */ + if (ecb->dte_cond & DTRACE_COND_OWNER) { + cred_t *cr; + cred_t *s_cr = state->dts_cred.dcr_cred; + proc_t *proc; + + ASSERT(s_cr != NULL); + +#ifdef __NetBSD__ + uid_t uid = kauth_cred_getuid(s_cr); + gid_t gid = kauth_cred_getgid(s_cr); + + if ((cr = CRED()) == NULL || + uid != kauth_cred_geteuid(cr) || + uid != kauth_cred_getuid(cr) || + uid != kauth_cred_getsvuid(cr) || + gid != kauth_cred_getegid(cr) || + gid != kauth_cred_getgid(cr) || + gid != kauth_cred_getsvgid(cr) || + (proc = ttoproc(curthread)) == NULL || + (proc->p_flag & SNOCD)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + } +#else /* __NetBSD__ */ + if ((cr = CRED()) == NULL || + s_cr->cr_uid != cr->cr_uid || + s_cr->cr_uid != cr->cr_ruid || + s_cr->cr_uid != cr->cr_suid || + s_cr->cr_gid != cr->cr_gid || + s_cr->cr_gid != cr->cr_rgid || + s_cr->cr_gid != cr->cr_sgid || + (proc = ttoproc(curthread)) == NULL || + (proc->p_flag & SNOCD)) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + +#ifdef illumos + mstate->dtms_access &= ~DTRACE_ACCESS_PROC; +#endif + } +#endif /* __NetBSD__ */ + } + +#ifdef illumos + /* + * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not + * in our zone, check to see if our mode policy is to restrict rather + * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC + * and DTRACE_ACCESS_ARGS + */ + if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) { + cred_t *cr; + cred_t *s_cr = state->dts_cred.dcr_cred; + + ASSERT(s_cr != NULL); + + if ((cr = CRED()) == NULL || + s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) { + if (mode & DTRACE_MODE_NOPRIV_DROP) + return (0); + + mstate->dtms_access &= + ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS); + } + } +#endif + + return (1); +} + +/* + * Note: not called from probe context. This function is called + * asynchronously (and at a regular interval) from outside of probe context to + * clean the dirty dynamic variable lists on all CPUs. Dynamic variable + * cleaning is explained in detail in . + */ +void +dtrace_dynvar_clean(dtrace_dstate_t *dstate) +{ + dtrace_dynvar_t *dirty; + dtrace_dstate_percpu_t *dcpu; + dtrace_dynvar_t **rinsep; + int i, j, work = 0; + + for (i = 0; i < NCPU; i++) { + dcpu = &dstate->dtds_percpu[i]; + + rinsep = &dcpu->dtdsc_rinsing; /* - * If the clean list is non-NULL, then we're not going to do - * any work for this CPU -- it means that there has not been - * a dtrace_dynvar() allocation on this CPU (or from this CPU) - * since the last time we cleaned house. + * If the dirty list is NULL, there is no dirty work to do. 
*/ - if (dcpu->dtdsc_clean != NULL) + if (dcpu->dtdsc_dirty == NULL) continue; + if (dcpu->dtdsc_rinsing != NULL) { + /* + * If the rinsing list is non-NULL, then it is because + * this CPU was selected to accept another CPU's + * dirty list -- and since that time, dirty buffers + * have accumulated. This is a highly unlikely + * condition, but we choose to ignore the dirty + * buffers -- they'll be picked up a future cleanse. + */ + continue; + } + + if (dcpu->dtdsc_clean != NULL) { + /* + * If the clean list is non-NULL, then we're in a + * situation where a CPU has done deallocations (we + * have a non-NULL dirty list) but no allocations (we + * also have a non-NULL clean list). We can't simply + * move the dirty list into the clean list on this + * CPU, yet we also don't want to allow this condition + * to persist, lest a short clean list prevent a + * massive dirty list from being cleaned (which in + * turn could lead to otherwise avoidable dynamic + * drops). To deal with this, we look for some CPU + * with a NULL clean list, NULL dirty list, and NULL + * rinsing list -- and then we borrow this CPU to + * rinse our dirty list. + */ + for (j = 0; j < NCPU; j++) { + dtrace_dstate_percpu_t *rinser; + + rinser = &dstate->dtds_percpu[j]; + + if (rinser->dtdsc_rinsing != NULL) + continue; + + if (rinser->dtdsc_dirty != NULL) + continue; + + if (rinser->dtdsc_clean != NULL) + continue; + + rinsep = &rinser->dtdsc_rinsing; + break; + } + + if (j == NCPU) { + /* + * We were unable to find another CPU that + * could accept this dirty list -- we are + * therefore unable to clean it now. + */ + dtrace_dynvar_failclean++; + continue; + } + } + work = 1; /* @@ -1389,7 +1922,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dst * on a hash chain, either the dirty list or the * rinsing list for some CPU must be non-NULL.) */ - dcpu->dtdsc_rinsing = dirty; + *rinsep = dirty; dtrace_membar_producer(); } while (dtrace_casptr(&dcpu->dtdsc_dirty, dirty, NULL) != dirty); @@ -1797,7 +2330,7 @@ retry: /* * The clean list appears to be non-empty. We want to - * move the clean list to the free list; we start by + * move the clean list to our free list; we start by * moving the clean pointer aside. */ if (dtrace_casptr(&dcpu->dtdsc_clean, @@ -1833,6 +2366,7 @@ retry: * owners of the clean lists out before resetting * the clean lists. */ + dcpu = &dstate->dtds_percpu[me]; rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean); ASSERT(rval == NULL); goto retry; @@ -1881,7 +2415,7 @@ retry: * this hash chain, or another CPU is deleting an element from this * hash chain. The simplest way to deal with both of these cases * (though not necessarily the most efficient) is to free our - * allocated block and tail-call ourselves. Note that the free is + * allocated block and re-attempt it all. Note that the free is * to the dirty list and _not_ to the free list. This is to prevent * races with allocators, above. 
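Both the cleaner above and the allocator retry path here lean on the same lock-free hand-off: a chain is detached from a per-CPU dirty list with a compare-and-swap, and it becomes visible on a rinsing pointer only after a producer barrier, so the cleaner and in-probe allocators never race on the same nodes. A rough user-space restatement with C11 atomics follows; it is a sketch only, the structure names are invented, and none of the per-CPU selection or borrowing policy is modelled.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct dynvar {
	struct dynvar *next;
};

struct percpu_lists {
	_Atomic(struct dynvar *) dirty;
	_Atomic(struct dynvar *) rinsing;
};

/* Producer side: push one node onto the dirty list. */
static void
dirty_push(struct percpu_lists *pc, struct dynvar *dv)
{
	struct dynvar *head;

	do {
		head = atomic_load(&pc->dirty);
		dv->next = head;
	} while (!atomic_compare_exchange_weak(&pc->dirty, &head, dv));
}

/* Cleaner side: detach the whole dirty chain and park it on a rinsing slot. */
static struct dynvar *
dirty_detach(struct percpu_lists *pc, struct percpu_lists *rinser)
{
	struct dynvar *chain;

	do {
		chain = atomic_load(&pc->dirty);
		if (chain == NULL)
			return (NULL);	/* nothing to clean */
		/*
		 * Publish the chain on the rinsing slot before detaching,
		 * mirroring the dtrace_membar_producer() in the patch.
		 */
		atomic_store_explicit(&rinser->rinsing, chain,
		    memory_order_release);
	} while (!atomic_compare_exchange_strong(&pc->dirty, &chain, NULL));

	return (chain);
}

int
main(void)
{
	struct percpu_lists pc = { 0 }, rinser = { 0 };
	struct dynvar nodes[3];
	struct dynvar *chain;
	int i, n = 0;

	for (i = 0; i < 3; i++)
		dirty_push(&pc, &nodes[i]);

	chain = dirty_detach(&pc, &rinser);
	for (; chain != NULL; chain = chain->next)
		n++;
	printf("detached %d nodes\n", n);
	return (0);
}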
*/ @@ -1894,7 +2428,7 @@ retry: dvar->dtdv_next = free; } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free); - return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate)); + goto top; } /*ARGSUSED*/ @@ -1974,6 +2508,75 @@ dtrace_aggregate_lquantize(uint64_t *lqu lquanta[levels + 1] += incr; } +static int +dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low, + uint16_t high, uint16_t nsteps, int64_t value) +{ + int64_t this = 1, last, next; + int base = 1, order; + + ASSERT(factor <= nsteps); + ASSERT(nsteps % factor == 0); + + for (order = 0; order < low; order++) + this *= factor; + + /* + * If our value is less than our factor taken to the power of the + * low order of magnitude, it goes into the zeroth bucket. + */ + if (value < (last = this)) + return (0); + + for (this *= factor; order <= high; order++) { + int nbuckets = this > nsteps ? nsteps : this; + + if ((next = this * factor) < this) { + /* + * We should not generally get log/linear quantizations + * with a high magnitude that allows 64-bits to + * overflow, but we nonetheless protect against this + * by explicitly checking for overflow, and clamping + * our value accordingly. + */ + value = this - 1; + } + + if (value < this) { + /* + * If our value lies within this order of magnitude, + * determine its position by taking the offset within + * the order of magnitude, dividing by the bucket + * width, and adding to our (accumulated) base. + */ + return (base + (value - last) / (this / nbuckets)); + } + + base += nbuckets - (nbuckets / factor); + last = this; + this = next; + } + + /* + * Our value is greater than or equal to our factor taken to the + * power of one plus the high magnitude -- return the top bucket. + */ + return (base); +} + +static void +dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr) +{ + uint64_t arg = *llquanta++; + uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg); + uint16_t low = DTRACE_LLQUANTIZE_LOW(arg); + uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg); + uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg); + + llquanta[dtrace_aggregate_llquantize_bucket(factor, + low, high, nsteps, nval)] += incr; +} + /*ARGSUSED*/ static void dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg) @@ -2334,9 +2937,10 @@ dtrace_speculation_commit(dtrace_state_t { dtrace_speculation_t *spec; dtrace_buffer_t *src, *dest; - uintptr_t daddr, saddr, dlimit; + uintptr_t daddr, saddr, dlimit, slimit; dtrace_speculation_state_t current, new = 0; intptr_t offs; + uint64_t timestamp; if (which == 0) return; @@ -2412,7 +3016,37 @@ dtrace_speculation_commit(dtrace_state_t } /* - * We have the space; copy the buffer across. (Note that this is a + * We have sufficient space to copy the speculative buffer into the + * primary buffer. First, modify the speculative buffer, filling + * in the timestamp of all entries with the current time. The data + * must have the commit() time rather than the time it was traced, + * so that all entries in the primary buffer are in timestamp order. 
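The dtrace_aggregate_llquantize_bucket() helper added earlier in this chunk implements the llquantize() action's hybrid log/linear binning: bucket 0 catches everything below factor^low, each covered order of magnitude is split into linearly spaced steps (roughly nsteps minus the ones that would overlap the previous magnitude), and a final bucket catches values at or above factor^(high+1). For example, with llquantize(x, 10, 0, 3, 10), a value of 47 lands in the [40,50) step of the 10-100 decade (bucket 13 by this numbering). The user-space restatement below is for experimenting with the mapping only; the asserts and kernel types are dropped and 'cur' stands in for the patch's 'this'.

#include <stdint.h>
#include <stdio.h>

static int
llq_bucket(uint16_t factor, uint16_t low, uint16_t high, uint16_t nsteps,
    int64_t value)
{
	int64_t cur = 1, last, next;
	int base = 1, order;

	for (order = 0; order < low; order++)
		cur *= factor;

	/* Anything below factor^low falls into the zeroth (underflow) bucket. */
	if (value < (last = cur))
		return (0);

	for (cur *= factor; order <= high; order++) {
		int nbuckets = cur > nsteps ? nsteps : cur;

		if ((next = cur * factor) < cur)
			value = cur - 1;	/* clamp on 64-bit overflow */

		if (value < cur)
			return (base + (value - last) / (cur / nbuckets));

		base += nbuckets - (nbuckets / factor);
		last = cur;
		cur = next;
	}

	return (base);		/* overflow bucket */
}

int
main(void)
{
	int64_t samples[] = { 0, 3, 47, 250, 9999, 123456 };
	size_t i;

	/* llquantize(x, 10, 0, 3, 10): decades 1..10^4, 10 steps per decade. */
	for (i = 0; i < sizeof (samples) / sizeof (samples[0]); i++)
		printf("%lld -> bucket %d\n", (long long)samples[i],
		    llq_bucket(10, 0, 3, 10, samples[i]));
	return (0);
}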
+ */ + timestamp = dtrace_gethrtime(); + saddr = (uintptr_t)src->dtb_tomax; + slimit = saddr + src->dtb_offset; + while (saddr < slimit) { + size_t size; + dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr; + + if (dtrh->dtrh_epid == DTRACE_EPIDNONE) { + saddr += sizeof (dtrace_epid_t); + continue; + } + ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs); + size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size; + + ASSERT3U(saddr + size, <=, slimit); + ASSERT3U(size, >=, sizeof (dtrace_rechdr_t)); + ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX); + + DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp); + + saddr += size; + } + + /* + * Copy the buffer across. (Note that this is a * highly subobtimal bcopy(); in the unlikely event that this becomes * a serious performance issue, a high-performance DTrace-specific * bcopy() should obviously be invented.) @@ -2736,7 +3370,6 @@ dtrace_dif_varstr(uintptr_t addr, dtrace return (ret); } -#ifdef notyet /* * Return a string from a memoy address which is known to have one or * more concatenated, individually zero terminated, sub-strings. @@ -2774,7 +3407,6 @@ dtrace_dif_varstrz(uintptr_t addr, size_ mstate->dtms_scratch_ptr += strsz; return (ret); } -#endif /* * This function implements the DIF emulator's variable lookups. The emulator @@ -2827,7 +3459,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst return (mstate->dtms_arg[ndx]); -#if defined(sun) +#ifdef illumos case DIF_VAR_UREGS: { klwp_t *lwp; @@ -2844,9 +3476,41 @@ dtrace_dif_variable(dtrace_mstate_t *mst return (0); } #endif +#ifdef __FreeBSD__ + case DIF_VAR_UREGS: { + struct trapframe *tframe; + + if (!dtrace_priv_proc(state)) + return (0); + + if ((tframe = curthread->td_frame) == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); + cpu_core[curcpu].cpuc_dtrace_illval = 0; + return (0); + } + + return (dtrace_getreg(tframe, ndx)); + } +#endif +#ifdef __NetBSD__ + case DIF_VAR_UREGS: { + struct trapframe *tframe; + + if (!dtrace_priv_proc(state)) + return (0); + + if ((tframe = lwp_trapframe(curlwp)) == NULL) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); + cpu_core[curcpu_id].cpuc_dtrace_illval = 0; + return (0); + } + + return (dtrace_getreg(tframe, ndx)); + } +#endif case DIF_VAR_CURTHREAD: - if (!dtrace_priv_kernel(state)) + if (!dtrace_priv_proc(state)) return (0); return ((uint64_t)(uintptr_t)curthread); @@ -2868,7 +3532,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst } return (mstate->dtms_walltimestamp); -#if defined(sun) +#ifdef illumos case DIF_VAR_IPL: if (!dtrace_priv_kernel(state)) return (0); @@ -3005,7 +3669,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst if (!dtrace_priv_proc(state)) return (0); -#if defined(sun) +#ifdef illumos /* * Note that we are assuming that an unanchored probe is * always due to a high-level interrupt. (And we're assuming @@ -3031,7 +3695,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst if (!dtrace_priv_proc(state)) return (0); -#if defined(sun) +#ifdef illumos /* * See comment in DIF_VAR_PID. */ @@ -3046,11 +3710,14 @@ dtrace_dif_variable(dtrace_mstate_t *mst */ return ((uint64_t)curthread->t_procp->p_ppid); #else - return ((uint64_t)curproc->p_pptr->p_pid); + if (curproc->p_pid == proc0.p_pid) + return (curproc->p_pid); + else + return (curproc->p_pptr->p_pid); #endif case DIF_VAR_TID: -#if defined(sun) +#ifdef illumos /* * See comment in DIF_VAR_PID. 
*/ @@ -3061,7 +3728,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst return ((uint64_t)curthread->t_tid); case DIF_VAR_EXECARGS: { -#if 0 +#ifdef __FreeBSD__ struct pargs *p_args = curthread->td_proc->p_args; if (p_args == NULL) @@ -3069,13 +3736,13 @@ dtrace_dif_variable(dtrace_mstate_t *mst return (dtrace_dif_varstrz( (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate)); -#endif - /* XXX FreeBSD extension */ +#else return 0; +#endif } case DIF_VAR_EXECNAME: -#if defined(sun) +#ifdef illumos if (!dtrace_priv_proc(state)) return (0); @@ -3096,11 +3763,11 @@ dtrace_dif_variable(dtrace_mstate_t *mst state, mstate)); #else return (dtrace_dif_varstr( - (uintptr_t) curthread->l_proc->p_comm, state, mstate)); + (uintptr_t) curproc->p_comm, state, mstate)); #endif case DIF_VAR_ZONENAME: -#if defined(sun) +#ifdef illumos if (!dtrace_priv_proc(state)) return (0); @@ -3127,7 +3794,7 @@ dtrace_dif_variable(dtrace_mstate_t *mst if (!dtrace_priv_proc(state)) return (0); -#if defined(sun) +#ifdef illumos /* * See comment in DIF_VAR_PID. */ @@ -3144,15 +3811,19 @@ dtrace_dif_variable(dtrace_mstate_t *mst * credential, since this is never NULL after process birth. */ return ((uint64_t)curthread->t_procp->p_cred->cr_uid); -#else - return (uint64_t)kauth_cred_getuid(curthread->t_procp->p_cred); +#endif +#ifdef __FreeBSD__ + return ((uint64_t)curthread->td_ucred->cr_uid); +#endif +#ifdef __NetBSD__ + return ((uint64_t)kauth_cred_getuid(curthread->t_procp->p_cred)); #endif case DIF_VAR_GID: if (!dtrace_priv_proc(state)) return (0); -#if defined(sun) +#ifdef illumos /* * See comment in DIF_VAR_PID. */ @@ -3169,12 +3840,16 @@ dtrace_dif_variable(dtrace_mstate_t *mst * credential, since this is never NULL after process birth. */ return ((uint64_t)curthread->t_procp->p_cred->cr_gid); -#else - return (uint64_t)kauth_cred_getgid(curthread->t_procp->p_cred); +#endif +#ifdef __FreeBSD__ + return ((uint64_t)curthread->td_ucred->cr_gid); +#endif +#ifdef __NetBSD__ + return ((uint64_t)kauth_cred_getgid(curthread->t_procp->p_cred)); #endif case DIF_VAR_ERRNO: { -#if defined(sun) +#ifdef illumos klwp_t *lwp; if (!dtrace_priv_proc(state)) return (0); @@ -3195,20 +3870,482 @@ dtrace_dif_variable(dtrace_mstate_t *mst return (0); return ((uint64_t)lwp->lwp_errno); -#else -#if 0 - return (curthread->l_errno); -#else - return 0; /* XXX TBD errno support at lwp level? */ #endif +#ifdef __FreeBSD__ + return (curthread->td_errno); +#endif +#ifdef __NetBSD__ + return 0; /* XXX TBD errno support at lwp level? */ #endif } +#ifndef illumos + case DIF_VAR_CPU: { + return curcpu_id; + } +#endif default: DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); return (0); } } + +typedef enum dtrace_json_state { + DTRACE_JSON_REST = 1, + DTRACE_JSON_OBJECT, + DTRACE_JSON_STRING, + DTRACE_JSON_STRING_ESCAPE, + DTRACE_JSON_STRING_ESCAPE_UNICODE, + DTRACE_JSON_COLON, + DTRACE_JSON_COMMA, + DTRACE_JSON_VALUE, + DTRACE_JSON_IDENTIFIER, + DTRACE_JSON_NUMBER, + DTRACE_JSON_NUMBER_FRAC, + DTRACE_JSON_NUMBER_EXP, + DTRACE_JSON_COLLECT_OBJECT +} dtrace_json_state_t; + +/* + * This function possesses just enough knowledge about JSON to extract a single + * value from a JSON string and store it in the scratch buffer. It is able + * to extract nested object values, and members of arrays by index. + * + * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to + * be looked up as we descend into the object tree. e.g. + * + * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL + * with nelems = 5. 
+ * + * The run time of this function must be bounded above by strsize to limit the + * amount of work done in probe context. As such, it is implemented as a + * simple state machine, reading one character at a time using safe loads + * until we find the requested element, hit a parsing error or run off the + * end of the object or string. + * + * As there is no way for a subroutine to return an error without interrupting + * clause execution, we simply return NULL in the event of a missing key or any + * other error condition. Each NULL return in this function is commented with + * the error condition it represents -- parsing or otherwise. + * + * The set of states for the state machine closely matches the JSON + * specification (http://json.org/). Briefly: + * + * DTRACE_JSON_REST: + * Skip whitespace until we find either a top-level Object, moving + * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_OBJECT: + * Locate the next key String in an Object. Sets a flag to denote + * the next String as a key string and moves to DTRACE_JSON_STRING. + * + * DTRACE_JSON_COLON: + * Skip whitespace until we find the colon that separates key Strings + * from their values. Once found, move to DTRACE_JSON_VALUE. + * + * DTRACE_JSON_VALUE: + * Detects the type of the next value (String, Number, Identifier, Object + * or Array) and routes to the states that process that type. Here we also + * deal with the element selector list if we are requested to traverse down + * into the object tree. + * + * DTRACE_JSON_COMMA: + * Skip whitespace until we find the comma that separates key-value pairs + * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays + * (similarly DTRACE_JSON_VALUE). All following literal value processing + * states return to this state at the end of their value, unless otherwise + * noted. + * + * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP: + * Processes a Number literal from the JSON, including any exponent + * component that may be present. Numbers are returned as strings, which + * may be passed to strtoll() if an integer is required. + * + * DTRACE_JSON_IDENTIFIER: + * Processes a "true", "false" or "null" literal in the JSON. + * + * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE, + * DTRACE_JSON_STRING_ESCAPE_UNICODE: + * Processes a String literal from the JSON, whether the String denotes + * a key, a value or part of a larger Object. Handles all escape sequences + * present in the specification, including four-digit unicode characters, + * but merely includes the escape sequence without converting it to the + * actual escaped character. If the String is flagged as a key, we + * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA. + * + * DTRACE_JSON_COLLECT_OBJECT: + * This state collects an entire Object (or Array), correctly handling + * embedded strings. If the full element selector list matches this nested + * object, we return the Object in full as a string. If not, we use this + * state to skip to the next value at this level and continue processing. + * + * NOTE: This function uses various macros from strtolctype.h to manipulate + * digit values, etc -- these have all been checked to ensure they make + * no additional function calls. 
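As a concrete illustration of the elemlist layout described above, the following user-space sketch (not part of the patch) flattens a selector such as "foo[0].bar.baz[32]" into the packed, NUL-separated list and then walks it the way dtrace_json() advances elem. It mirrors the splitting loop in the DIF_SUBR_JSON case further down in this hunk, minus the scratch-buffer and safe-load machinery.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *selector = "foo[0].bar.baz[32]";
	char packed[64], *ee = packed;
	const char *cp;
	int nelems = 1, i;

	for (cp = selector; *cp != '\0'; cp++) {
		char cc = *cp;

		if (cp == selector && cc == '[')
			continue;	/* leading array index: drop the bracket */
		if (cc == ']')
			continue;
		if (cc == '.' || cc == '[') {
			nelems++;
			cc = '\0';	/* element boundary */
		}
		*ee++ = cc;
	}
	*ee++ = '\0';

	/* Walk the packed list the same way dtrace_json() advances 'elem'. */
	cp = packed;
	for (i = 0; i < nelems; i++) {
		printf("elem[%d] = \"%s\"\n", i, cp);
		cp += strlen(cp) + 1;
	}
	return (0);
}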
+ */ +static char * +dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems, + char *dest) +{ + dtrace_json_state_t state = DTRACE_JSON_REST; + int64_t array_elem = INT64_MIN; + int64_t array_pos = 0; + uint8_t escape_unicount = 0; + boolean_t string_is_key = B_FALSE; + boolean_t collect_object = B_FALSE; + boolean_t found_key = B_FALSE; + boolean_t in_array = B_FALSE; + uint32_t braces = 0, brackets = 0; + char *elem = elemlist; + char *dd = dest; + uintptr_t cur; + + for (cur = json; cur < json + size; cur++) { + char cc = dtrace_load8(cur); + if (cc == '\0') + return (NULL); + + switch (state) { + case DTRACE_JSON_REST: + if (isspace(cc)) + break; + + if (cc == '{') { + state = DTRACE_JSON_OBJECT; + break; + } + + if (cc == '[') { + in_array = B_TRUE; + array_pos = 0; + array_elem = dtrace_strtoll(elem, 10, size); + found_key = array_elem == 0 ? B_TRUE : B_FALSE; + state = DTRACE_JSON_VALUE; + break; + } + + /* + * ERROR: expected to find a top-level object or array. + */ + return (NULL); + case DTRACE_JSON_OBJECT: + if (isspace(cc)) + break; + + if (cc == '"') { + state = DTRACE_JSON_STRING; + string_is_key = B_TRUE; + break; + } + + /* + * ERROR: either the object did not start with a key + * string, or we've run off the end of the object + * without finding the requested key. + */ + return (NULL); + case DTRACE_JSON_STRING: + if (cc == '\\') { + *dd++ = '\\'; + state = DTRACE_JSON_STRING_ESCAPE; + break; + } + + if (cc == '"') { + if (collect_object) { + /* + * We don't reset the dest here, as + * the string is part of a larger + * object being collected. + */ + *dd++ = cc; + collect_object = B_FALSE; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + *dd = '\0'; + dd = dest; /* reset string buffer */ + if (string_is_key) { + if (dtrace_strncmp(dest, elem, + size) == 0) + found_key = B_TRUE; + } else if (found_key) { + if (nelems > 1) { + /* + * We expected an object, not + * this string. + */ + return (NULL); + } + return (dest); + } + state = string_is_key ? DTRACE_JSON_COLON : + DTRACE_JSON_COMMA; + string_is_key = B_FALSE; + break; + } + + *dd++ = cc; + break; + case DTRACE_JSON_STRING_ESCAPE: + *dd++ = cc; + if (cc == 'u') { + escape_unicount = 0; + state = DTRACE_JSON_STRING_ESCAPE_UNICODE; + } else { + state = DTRACE_JSON_STRING; + } + break; + case DTRACE_JSON_STRING_ESCAPE_UNICODE: + if (!isxdigit(cc)) { + /* + * ERROR: invalid unicode escape, expected + * four valid hexidecimal digits. + */ + return (NULL); + } + + *dd++ = cc; + if (++escape_unicount == 4) + state = DTRACE_JSON_STRING; + break; + case DTRACE_JSON_COLON: + if (isspace(cc)) + break; + + if (cc == ':') { + state = DTRACE_JSON_VALUE; + break; + } + + /* + * ERROR: expected a colon. + */ + return (NULL); + case DTRACE_JSON_COMMA: + if (isspace(cc)) + break; + + if (cc == ',') { + if (in_array) { + state = DTRACE_JSON_VALUE; + if (++array_pos == array_elem) + found_key = B_TRUE; + } else { + state = DTRACE_JSON_OBJECT; + } + break; + } + + /* + * ERROR: either we hit an unexpected character, or + * we reached the end of the object or array without + * finding the requested key. + */ + return (NULL); + case DTRACE_JSON_IDENTIFIER: + if (islower(cc)) { + *dd++ = cc; + break; + } + + *dd = '\0'; + dd = dest; /* reset string buffer */ + + if (dtrace_strncmp(dest, "true", 5) == 0 || + dtrace_strncmp(dest, "false", 6) == 0 || + dtrace_strncmp(dest, "null", 5) == 0) { + if (found_key) { + if (nelems > 1) { + /* + * ERROR: We expected an object, + * not this identifier. 
+ */ + return (NULL); + } + return (dest); + } else { + cur--; + state = DTRACE_JSON_COMMA; + break; + } + } + + /* + * ERROR: we did not recognise the identifier as one + * of those in the JSON specification. + */ + return (NULL); + case DTRACE_JSON_NUMBER: + if (cc == '.') { + *dd++ = cc; + state = DTRACE_JSON_NUMBER_FRAC; + break; + } + + if (cc == 'x' || cc == 'X') { + /* + * ERROR: specification explicitly excludes + * hexidecimal or octal numbers. + */ + return (NULL); + } + + /* FALLTHRU */ + case DTRACE_JSON_NUMBER_FRAC: + if (cc == 'e' || cc == 'E') { + *dd++ = cc; + state = DTRACE_JSON_NUMBER_EXP; + break; + } + + if (cc == '+' || cc == '-') { + /* + * ERROR: expect sign as part of exponent only. + */ + return (NULL); + } + /* FALLTHRU */ + case DTRACE_JSON_NUMBER_EXP: + if (isdigit(cc) || cc == '+' || cc == '-') { + *dd++ = cc; + break; + } + + *dd = '\0'; + dd = dest; /* reset string buffer */ + if (found_key) { + if (nelems > 1) { + /* + * ERROR: We expected an object, not + * this number. + */ + return (NULL); + } + return (dest); + } + + cur--; + state = DTRACE_JSON_COMMA; + break; + case DTRACE_JSON_VALUE: + if (isspace(cc)) + break; + + if (cc == '{' || cc == '[') { + if (nelems > 1 && found_key) { + in_array = cc == '[' ? B_TRUE : B_FALSE; + /* + * If our element selector directs us + * to descend into this nested object, + * then move to the next selector + * element in the list and restart the + * state machine. + */ + while (*elem != '\0') + elem++; + elem++; /* skip the inter-element NUL */ + nelems--; + dd = dest; + if (in_array) { + state = DTRACE_JSON_VALUE; + array_pos = 0; + array_elem = dtrace_strtoll( + elem, 10, size); + found_key = array_elem == 0 ? + B_TRUE : B_FALSE; + } else { + found_key = B_FALSE; + state = DTRACE_JSON_OBJECT; + } + break; + } + + /* + * Otherwise, we wish to either skip this + * nested object or return it in full. + */ + if (cc == '[') + brackets = 1; + else + braces = 1; + *dd++ = cc; + state = DTRACE_JSON_COLLECT_OBJECT; + break; + } + + if (cc == '"') { + state = DTRACE_JSON_STRING; + break; + } + + if (islower(cc)) { + /* + * Here we deal with true, false and null. + */ + *dd++ = cc; + state = DTRACE_JSON_IDENTIFIER; + break; + } + + if (cc == '-' || isdigit(cc)) { + *dd++ = cc; + state = DTRACE_JSON_NUMBER; + break; + } + + /* + * ERROR: unexpected character at start of value. + */ + return (NULL); + case DTRACE_JSON_COLLECT_OBJECT: + if (cc == '\0') + /* + * ERROR: unexpected end of input. + */ + return (NULL); + + *dd++ = cc; + if (cc == '"') { + collect_object = B_TRUE; + state = DTRACE_JSON_STRING; + break; + } + + if (cc == ']') { + if (brackets-- == 0) { + /* + * ERROR: unbalanced brackets. + */ + return (NULL); + } + } else if (cc == '}') { + if (braces-- == 0) { + /* + * ERROR: unbalanced braces. + */ + return (NULL); + } + } else if (cc == '{') { + braces++; + } else if (cc == '[') { + brackets++; + } + + if (brackets == 0 && braces == 0) { + if (found_key) { + *dd = '\0'; + return (dest); + } + dd = dest; /* reset string buffer */ + state = DTRACE_JSON_COMMA; + } + break; + } + } + return (NULL); +} + /* * Emulate the execution of DTrace ID subroutines invoked by the call opcode. 
* Notice that we don't bother validating the proper number of arguments or @@ -3225,7 +4362,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, volatile uintptr_t *illval = &cpu_core[curcpu_id].cpuc_dtrace_illval; dtrace_vstate_t *vstate = &state->dts_vstate; -#if defined(sun) +#ifdef illumos union { mutex_impl_t mi; uint64_t mx; @@ -3235,7 +4372,15 @@ dtrace_dif_subr(uint_t subr, uint_t rd, krwlock_t ri; uintptr_t rw; } r; -#else +#endif +#ifdef __FreeBSD__ + struct thread *lowner; + union { + struct lock_object *li; + uintptr_t lx; + } l; +#endif +#ifdef __NetBSD__ union { kmutex_t mi; uint64_t mx; @@ -3249,10 +4394,11 @@ dtrace_dif_subr(uint_t subr, uint_t rd, switch (subr) { case DIF_SUBR_RAND: - regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875; + regs[rd] = dtrace_xoroshiro128_plus_next( + state->dts_rstate[curcpu_id]); break; -#if defined(sun) +#ifdef illumos case DIF_SUBR_MUTEX_OWNED: if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), mstate, vstate)) { @@ -3340,7 +4486,101 @@ dtrace_dif_subr(uint_t subr, uint_t rd, regs[rd] = _RW_ISWRITER(&r.ri); break; -#else +#endif /* illumos */ +#ifdef __FreeBSD__ + case DIF_SUBR_MUTEX_OWNED: + if (!dtrace_canload(tupregs[0].dttk_value, + sizeof (struct lock_object), mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_MUTEX_OWNER: + if (!dtrace_canload(tupregs[0].dttk_value, + sizeof (struct lock_object), mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + LOCK_CLASS(l.li)->lc_owner(l.li, &lowner); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + regs[rd] = (uintptr_t)lowner; + break; + + case DIF_SUBR_MUTEX_TYPE_ADAPTIVE: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SLEEPLOCK) != 0; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_MUTEX_TYPE_SPIN: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_RW_READ_HELD: + case DIF_SUBR_SX_SHARED_HELD: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) && + lowner == NULL; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_RW_WRITE_HELD: + case DIF_SUBR_SX_EXCLUSIVE_HELD: + if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr(tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) && + lowner != NULL; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + break; + + case DIF_SUBR_RW_ISWRITER: + case DIF_SUBR_SX_ISEXCLUSIVE: + if (!dtrace_canload(tupregs[0].dttk_value, 
sizeof (uintptr_t), + mstate, vstate)) { + regs[rd] = 0; + break; + } + l.lx = dtrace_loadptr(tupregs[0].dttk_value); + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + LOCK_CLASS(l.li)->lc_owner(l.li, &lowner); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + regs[rd] = (lowner == curthread); + break; + +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ case DIF_SUBR_MUTEX_OWNED: if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t), mstate, vstate)) { @@ -3426,7 +4666,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, regs[rd] = _RW_ISWRITER(&r.ri); break; -#endif /* ! defined(sun) */ +#endif /* __NetBSD__ */ case DIF_SUBR_BCOPY: { /* @@ -3536,7 +4776,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, break; } -#if defined(sun) +#ifdef illumos case DIF_SUBR_MSGSIZE: case DIF_SUBR_MSGDSIZE: { uintptr_t baddr = tupregs[0].dttk_value, daddr; @@ -3604,7 +4844,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); for (p = curthread->t_procp; p != NULL; p = p->p_parent) { -#if defined(sun) +#ifdef illumos if (p->p_pidp->pid_id == pid) { #else if (p->p_pid == pid) { @@ -3643,30 +4883,30 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uintptr_t kaddr = tupregs[0].dttk_value; uintptr_t uaddr = tupregs[1].dttk_value; uint64_t size = tupregs[2].dttk_value; + size_t lim; if (!dtrace_destructive_disallow && dtrace_priv_proc_control(state) && - !dtrace_istoxic(kaddr, size)) { + !dtrace_istoxic(kaddr, size) && + dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - dtrace_copyoutstr(kaddr, uaddr, size, flags); + dtrace_copyoutstr(kaddr, uaddr, lim, flags); DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); } break; } case DIF_SUBR_STRLEN: { - size_t sz; + size_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t addr = (uintptr_t)tupregs[0].dttk_value; - sz = dtrace_strlen((char *)addr, - state->dts_options[DTRACEOPT_STRSIZE]); + size_t lim; - if (!dtrace_canload(addr, sz + 1, mstate, vstate)) { + if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) { regs[rd] = 0; break; } - regs[rd] = sz; - + regs[rd] = dtrace_strlen((char *)addr, lim); break; } @@ -3679,12 +4919,19 @@ dtrace_dif_subr(uint_t subr, uint_t rd, * is DIF_SUBR_STRRCHR, we will look for the last occurrence * of the specified character instead of the first. 
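A smaller change near the top of this hunk is easy to miss: DIF_SUBR_RAND no longer evaluates the old gethrtime()-based linear-congruential expression and instead draws from per-CPU state via dtrace_xoroshiro128_plus_next(). Presumably that helper follows the published xoroshiro128+ recurrence; the user-space sketch below uses the original 2016 rotation constants (55, 14, 36), which is an assumption on my part, and the in-kernel helper may differ in detail.

#include <stdint.h>
#include <stdio.h>

static inline uint64_t
rotl64(uint64_t x, int k)
{
	return ((x << k) | (x >> (64 - k)));
}

/* One step of xoroshiro128+; 's' must not be all-zero. */
static uint64_t
xoroshiro128plus_next(uint64_t s[2])
{
	uint64_t s0 = s[0];
	uint64_t s1 = s[1];
	uint64_t result = s0 + s1;	/* the returned random value */

	s1 ^= s0;
	s[0] = rotl64(s0, 55) ^ s1 ^ (s1 << 14);
	s[1] = rotl64(s1, 36);
	return (result);
}

int
main(void)
{
	uint64_t state[2] = { 0x9E3779B97F4A7C15ULL, 0xBF58476D1CE4E5B9ULL };
	int i;

	for (i = 0; i < 4; i++)
		printf("%016llx\n",
		    (unsigned long long)xoroshiro128plus_next(state));
	return (0);
}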
*/ - uintptr_t saddr = tupregs[0].dttk_value; uintptr_t addr = tupregs[0].dttk_value; - uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t addr_limit; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + size_t lim; char c, target = (char)tupregs[1].dttk_value; - for (regs[rd] = 0; addr < limit; addr++) { + if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) { + regs[rd] = 0; + break; + } + addr_limit = addr + lim; + + for (regs[rd] = 0; addr < addr_limit; addr++) { if ((c = dtrace_load8(addr)) == target) { regs[rd] = addr; @@ -3695,12 +4942,6 @@ dtrace_dif_subr(uint_t subr, uint_t rd, if (c == '\0') break; } - - if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) { - regs[rd] = 0; - break; - } - break; } @@ -3858,7 +5099,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uintptr_t addr = tupregs[0].dttk_value; uintptr_t tokaddr = tupregs[1].dttk_value; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; - uintptr_t limit, toklimit = tokaddr + size; + uintptr_t limit, toklimit; + size_t clim; uint8_t c = 0, tokmap[32]; /* 256 / 8 */ char *dest = (char *)mstate->dtms_scratch_ptr; int i; @@ -3867,10 +5109,11 @@ dtrace_dif_subr(uint_t subr, uint_t rd, * Check both the token buffer and (later) the input buffer, * since both could be non-scratch addresses. */ - if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) { + if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) { regs[rd] = 0; break; } + toklimit = tokaddr + clim; if (!DTRACE_INSCRATCH(mstate, size)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); @@ -3887,6 +5130,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, * it behaves like an implicit clause-local variable. */ addr = mstate->dtms_strtok; + limit = mstate->dtms_strtok_limit; } else { /* * If the user-specified address is non-NULL we must @@ -3896,10 +5140,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, * (when we fetch addr from mstate->dtms_strtok) * would fail this access check. */ - if (!dtrace_strcanload(addr, size, mstate, vstate)) { + if (!dtrace_strcanload(addr, size, &clim, mstate, + vstate)) { regs[rd] = 0; break; } + limit = addr + clim; } /* @@ -3918,10 +5164,10 @@ dtrace_dif_subr(uint_t subr, uint_t rd, tokmap[c >> 3] |= (1 << (c & 0x7)); } - for (limit = addr + size; addr < limit; addr++) { + for (; addr < limit; addr++) { /* - * We're looking for a character that is _not_ contained - * in the token string. + * We're looking for a character that is _not_ + * contained in the token string. 
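The strtok() scan around here is driven by a 256-bit delimiter map built in scratch: tokmap[32] holds one bit per possible byte value, set for every character of the token string and then tested for each input byte. A minimal user-space illustration of the same tokmap trick (not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *delims = " ,;";
	const char *input = "foo, bar;baz";
	uint8_t tokmap[32];	/* 256 bits: one per possible byte value */
	const char *p;
	unsigned char c;

	memset(tokmap, 0, sizeof (tokmap));
	for (p = delims; (c = (unsigned char)*p) != '\0'; p++)
		tokmap[c >> 3] |= 1 << (c & 0x7);

	/* Tag each byte of the input as delimiter or token text. */
	for (p = input; (c = (unsigned char)*p) != '\0'; p++)
		printf("'%c' %s\n", c,
		    (tokmap[c >> 3] & (1 << (c & 0x7))) ? "delim" : "token");
	return (0);
}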
*/ if ((c = dtrace_load8(addr)) == '\0') break; @@ -3939,6 +5185,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, */ regs[rd] = 0; mstate->dtms_strtok = 0; + mstate->dtms_strtok_limit = 0; break; } @@ -3949,29 +5196,151 @@ dtrace_dif_subr(uint_t subr, uint_t rd, if ((c = dtrace_load8(addr)) == '\0') break; - if (tokmap[c >> 3] & (1 << (c & 0x7))) - break; + if (tokmap[c >> 3] & (1 << (c & 0x7))) + break; + + ASSERT(i < size); + dest[i++] = c; + } + + ASSERT(i < size); + dest[i] = '\0'; + regs[rd] = (uintptr_t)dest; + mstate->dtms_scratch_ptr += size; + mstate->dtms_strtok = addr; + mstate->dtms_strtok_limit = limit; + break; + } + + case DIF_SUBR_SUBSTR: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + char *d = (char *)mstate->dtms_scratch_ptr; + int64_t index = (int64_t)tupregs[1].dttk_value; + int64_t remaining = (int64_t)tupregs[2].dttk_value; + size_t len = dtrace_strlen((char *)s, size); + int64_t i; + + if (!dtrace_canload(s, len + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + if (nargs <= 2) + remaining = (int64_t)size; + + if (index < 0) { + index += len; + + if (index < 0 && index + remaining > 0) { + remaining += index; + index = 0; + } + } + + if (index >= len || index < 0) { + remaining = 0; + } else if (remaining < 0) { + remaining += len - index; + } else if (index + remaining > size) { + remaining = size - index; + } + + for (i = 0; i < remaining; i++) { + if ((d[i] = dtrace_load8(s + index + i)) == '\0') + break; + } + + d[i] = '\0'; + + mstate->dtms_scratch_ptr += size; + regs[rd] = (uintptr_t)d; + break; + } + + case DIF_SUBR_JSON: { + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + uintptr_t json = tupregs[0].dttk_value; + size_t jsonlen = dtrace_strlen((char *)json, size); + uintptr_t elem = tupregs[1].dttk_value; + size_t elemlen = dtrace_strlen((char *)elem, size); + + char *dest = (char *)mstate->dtms_scratch_ptr; + char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1; + char *ee = elemlist; + int nelems = 1; + uintptr_t cur; + + if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) || + !dtrace_canload(elem, elemlen + 1, mstate, vstate)) { + regs[rd] = 0; + break; + } + + if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + regs[rd] = 0; + break; + } + + /* + * Read the element selector and split it up into a packed list + * of strings. + */ + for (cur = elem; cur < elem + elemlen; cur++) { + char cc = dtrace_load8(cur); + + if (cur == elem && cc == '[') { + /* + * If the first element selector key is + * actually an array index then ignore the + * bracket. + */ + continue; + } + + if (cc == ']') + continue; - ASSERT(i < size); - dest[i++] = c; + if (cc == '.' 
|| cc == '[') { + nelems++; + cc = '\0'; + } + + *ee++ = cc; } + *ee++ = '\0'; - ASSERT(i < size); - dest[i] = '\0'; - regs[rd] = (uintptr_t)dest; - mstate->dtms_scratch_ptr += size; - mstate->dtms_strtok = addr; + if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist, + nelems, dest)) != 0) + mstate->dtms_scratch_ptr += jsonlen + 1; break; } - case DIF_SUBR_SUBSTR: { + case DIF_SUBR_TOUPPER: + case DIF_SUBR_TOLOWER: { uintptr_t s = tupregs[0].dttk_value; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; - char *d = (char *)mstate->dtms_scratch_ptr; - int64_t index = (int64_t)tupregs[1].dttk_value; - int64_t remaining = (int64_t)tupregs[2].dttk_value; + char *dest = (char *)mstate->dtms_scratch_ptr, c; size_t len = dtrace_strlen((char *)s, size); - int64_t i = 0; + char lower, upper, convert; + int64_t i; + + if (subr == DIF_SUBR_TOUPPER) { + lower = 'a'; + upper = 'z'; + convert = 'A'; + } else { + lower = 'A'; + upper = 'Z'; + convert = 'a'; + } if (!dtrace_canload(s, len + 1, mstate, vstate)) { regs[rd] = 0; @@ -3984,39 +5353,24 @@ dtrace_dif_subr(uint_t subr, uint_t rd, break; } - if (nargs <= 2) - remaining = (int64_t)size; - - if (index < 0) { - index += len; - - if (index < 0 && index + remaining > 0) { - remaining += index; - index = 0; - } - } + for (i = 0; i < size - 1; i++) { + if ((c = dtrace_load8(s + i)) == '\0') + break; - if (index >= len || index < 0) { - remaining = 0; - } else if (remaining < 0) { - remaining += len - index; - } else if (index + remaining > size) { - remaining = size - index; - } + if (c >= lower && c <= upper) + c = convert + (c - lower); - for (i = 0; i < remaining; i++) { - if ((d[i] = dtrace_load8(s + index + i)) == '\0') - break; + dest[i] = c; } - d[i] = '\0'; - + ASSERT(i < size); + dest[i] = '\0'; + regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; - regs[rd] = (uintptr_t)d; break; } -#if defined(sun) +#ifdef illumos case DIF_SUBR_GETMAJOR: #ifdef _LP64 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64; @@ -4232,10 +5586,12 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t s1 = tupregs[0].dttk_value; uintptr_t s2 = tupregs[1].dttk_value; - int i = 0; + int i = 0, j = 0; + size_t lim1, lim2; + char c; - if (!dtrace_strcanload(s1, size, mstate, vstate) || - !dtrace_strcanload(s2, size, mstate, vstate)) { + if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) || + !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) { regs[rd] = 0; break; } @@ -4253,7 +5609,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, break; } - if ((d[i++] = dtrace_load8(s1++)) == '\0') { + c = (i >= lim1) ? '\0' : dtrace_load8(s1++); + if ((d[i++] = c) == '\0') { i--; break; } @@ -4266,7 +5623,8 @@ dtrace_dif_subr(uint_t subr, uint_t rd, break; } - if ((d[i++] = dtrace_load8(s2++)) == '\0') + c = (j++ >= lim2) ? 
'\0' : dtrace_load8(s2++); + if ((d[i++] = c) == '\0') break; } @@ -4278,11 +5636,45 @@ dtrace_dif_subr(uint_t subr, uint_t rd, break; } + case DIF_SUBR_STRTOLL: { + uintptr_t s = tupregs[0].dttk_value; + uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; + size_t lim; + int base = 10; + + if (nargs > 1) { + if ((base = tupregs[1].dttk_value) <= 1 || + base > ('z' - 'a' + 1) + ('9' - '0' + 1)) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) { + regs[rd] = INT64_MIN; + break; + } + + regs[rd] = dtrace_strtoll((char *)s, base, lim); + break; + } + case DIF_SUBR_LLTOSTR: { int64_t i = (int64_t)tupregs[0].dttk_value; - int64_t val = i < 0 ? i * -1 : i; - uint64_t size = 22; /* enough room for 2^64 in decimal */ + uint64_t val, digit; + uint64_t size = 65; /* enough room for 2^64 in binary */ char *end = (char *)mstate->dtms_scratch_ptr + size - 1; + int base = 10; + + if (nargs > 1) { + if ((base = tupregs[1].dttk_value) <= 1 || + base > ('z' - 'a' + 1) + ('9' - '0' + 1)) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + } + + val = (base == 10 && i < 0) ? i * -1 : i; if (!DTRACE_INSCRATCH(mstate, size)) { DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); @@ -4290,13 +5682,24 @@ dtrace_dif_subr(uint_t subr, uint_t rd, break; } - for (*end-- = '\0'; val; val /= 10) - *end-- = '0' + (val % 10); + for (*end-- = '\0'; val; val /= base) { + if ((digit = val % base) <= '9' - '0') { + *end-- = '0' + digit; + } else { + *end-- = 'a' + (digit - ('9' - '0') - 1); + } + } + + if (i == 0 && base == 16) + *end-- = '0'; + + if (base == 16) + *end-- = 'x'; - if (i == 0) + if (i == 0 || base == 8 || base == 16) *end-- = '0'; - if (i < 0) + if (i < 0 && base == 10) *end-- = '-'; regs[rd] = (uintptr_t)end + 1; @@ -4465,13 +5868,39 @@ dtrace_dif_subr(uint_t subr, uint_t rd, break; } + case DIF_SUBR_GETF: { + uintptr_t fd = tupregs[0].dttk_value; + struct filedesc *fdp; + file_t *fp; + + if (!dtrace_priv_proc(state)) { + regs[rd] = 0; + break; + } +#ifdef __FreeBSD_ + fdp = curproc->p_fd; + FILEDESC_SLOCK(fdp); + fp = fget_locked(fdp, fd); + mstate->dtms_getf = fp; + regs[rd] = (uintptr_t)fp; + FILEDESC_SUNLOCK(fdp); +#endif +#ifdef __NetBSD__ + regs[rd] = 0; +#endif + break; + } case DIF_SUBR_CLEANPATH: { char *dest = (char *)mstate->dtms_scratch_ptr, c; uint64_t size = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t src = tupregs[0].dttk_value; + size_t lim; int i = 0, j = 0; +#ifdef illumos + zone_t *z; +#endif - if (!dtrace_strcanload(src, size, mstate, vstate)) { + if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) { regs[rd] = 0; break; } @@ -4486,7 +5915,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, * Move forward, loading each character. */ do { - c = dtrace_load8(src + i++); + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); next: if (j + 5 >= size) /* 5 = strlen("/..c\0") */ break; @@ -4496,7 +5925,7 @@ next: continue; } - c = dtrace_load8(src + i++); + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); if (c == '/') { /* @@ -4517,7 +5946,7 @@ next: continue; } - c = dtrace_load8(src + i++); + c = (i >= lim) ? '\0' : dtrace_load8(src + i++); if (c == '/') { /* @@ -4540,7 +5969,7 @@ next: continue; } - c = dtrace_load8(src + i++); + c = (i >= lim) ? 
'\0' : dtrace_load8(src + i++); if (c != '/' && c != '\0') { /* @@ -4568,6 +5997,25 @@ next: } while (c != '\0'); dest[j] = '\0'; + +#ifdef illumos + if (mstate->dtms_getf != NULL && + !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) && + (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) { + /* + * If we've done a getf() as a part of this ECB and we + * don't have kernel access (and we're not in the global + * zone), check if the path we cleaned up begins with + * the zone's root path, and trim it off if so. Note + * that this is an output cleanliness issue, not a + * security issue: knowing one's zone root path does + * not enable privilege escalation. + */ + if (strstr(dest, z->zone_rootpath) == dest) + dest += strlen(z->zone_rootpath) - 1; + } +#endif + regs[rd] = (uintptr_t)dest; mstate->dtms_scratch_ptr += size; break; @@ -4592,6 +6040,12 @@ next: ipaddr_t ip4; uint8_t *ptr8, val; + if (!dtrace_canload(tupregs[argi].dttk_value, + sizeof (ipaddr_t), mstate, vstate)) { + regs[rd] = 0; + break; + } + /* * Safely load the IPv4 address. */ @@ -4645,6 +6099,12 @@ next: * just the IPv4 string is returned for inet_ntoa6. */ + if (!dtrace_canload(tupregs[argi].dttk_value, + sizeof (struct in6_addr), mstate, vstate)) { + regs[rd] = 0; + break; + } + /* * Safely load the IPv6 address. */ @@ -4673,7 +6133,7 @@ next: tryzero = -1; numzero = 1; for (i = 0; i < sizeof (struct in6_addr); i++) { -#if defined(sun) +#ifdef illumos if (ip6._S6_un._S6_u8[i] == 0 && #else if (ip6.__u6_addr.__u6_addr8[i] == 0 && @@ -4684,7 +6144,7 @@ next: } if (tryzero != -1 && -#if defined(sun) +#ifdef illumos (ip6._S6_un._S6_u8[i] != 0 || #else (ip6.__u6_addr.__u6_addr8[i] != 0 || @@ -4700,7 +6160,7 @@ next: numzero = i - i % 2 - tryzero; tryzero = -1; -#if defined(sun) +#ifdef illumos if (ip6._S6_un._S6_u8[i] == 0 && #else if (ip6.__u6_addr.__u6_addr8[i] == 0 && @@ -4721,7 +6181,7 @@ next: i >= DTRACE_V4MAPPED_OFFSET; i--) { ASSERT(end >= base); -#if defined(sun) +#ifdef illumos val = ip6._S6_un._S6_u8[i]; #else val = ip6.__u6_addr.__u6_addr8[i]; @@ -4766,7 +6226,7 @@ next: if (i < 14 && i != firstzero - 2) *end-- = ':'; -#if defined(sun) +#ifdef illumos val = (ip6._S6_un._S6_u8[i] << 8) + ip6._S6_un._S6_u8[i + 1]; #else @@ -4812,21 +6272,44 @@ inetout: regs[rd] = (uintptr_t)end + 1; break; } - case DIF_SUBR_TYPEREF: { - uintptr_t size = 4 * sizeof(uintptr_t); - uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t)); - size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size; - - /* address, num_elements, type_str, type_len */ - typeref[0] = tupregs[0].dttk_value; - typeref[1] = tupregs[1].dttk_value; - typeref[2] = tupregs[2].dttk_value; - typeref[3] = tupregs[3].dttk_value; +#ifndef illumos + case DIF_SUBR_MEMSTR: { + char *str = (char *)mstate->dtms_scratch_ptr; + uintptr_t mem = tupregs[0].dttk_value; + char c = tupregs[1].dttk_value; + size_t size = tupregs[2].dttk_value; + uint8_t n; + int i; + + regs[rd] = 0; - regs[rd] = (uintptr_t) typeref; - mstate->dtms_scratch_ptr += scratch_size; + if (size == 0) + break; + + if (!dtrace_canload(mem, size - 1, mstate, vstate)) + break; + + if (!DTRACE_INSCRATCH(mstate, size)) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH); + break; + } + + if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + + for (i = 0; i < size - 1; i++) { + n = dtrace_load8(mem++); + str[i] = (n == 0) ? 
c : n; + } + str[size - 1] = 0; + + regs[rd] = (uintptr_t)str; + mstate->dtms_scratch_ptr += size; break; } +#endif } } @@ -5002,102 +6485,95 @@ dtrace_dif_emulate(dtrace_difo_t *difo, pc = DIF_INSTR_LABEL(instr); break; case DIF_OP_RLDSB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSB: regs[rd] = (int8_t)dtrace_load8(regs[r1]); break; case DIF_OP_RLDSH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSH: regs[rd] = (int16_t)dtrace_load16(regs[r1]); break; case DIF_OP_RLDSW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDSW: regs[rd] = (int32_t)dtrace_load32(regs[r1]); break; case DIF_OP_RLDUB: - if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 1, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUB: regs[rd] = dtrace_load8(regs[r1]); break; case DIF_OP_RLDUH: - if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 2, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUH: regs[rd] = dtrace_load16(regs[r1]); break; case DIF_OP_RLDUW: - if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 4, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDUW: regs[rd] = dtrace_load32(regs[r1]); break; case DIF_OP_RLDX: - if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) { - *flags |= CPU_DTRACE_KPRIV; - *illval = regs[r1]; + if (!dtrace_canload(regs[r1], 8, mstate, vstate)) break; - } /*FALLTHROUGH*/ case DIF_OP_LDX: regs[rd] = dtrace_load64(regs[r1]); break; case DIF_OP_ULDSB: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = (int8_t) dtrace_fuword8((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDSH: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = (int16_t) dtrace_fuword16((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDSW: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = (int32_t) dtrace_fuword32((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDUB: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword8((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDUH: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword16((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDUW: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword32((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_ULDX: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); regs[rd] = dtrace_fuword64((void *)(uintptr_t)regs[r1]); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); break; case DIF_OP_RET: rval = regs[rd]; @@ -5116,15 +6592,17 @@ dtrace_dif_emulate(dtrace_difo_t *difo, size_t sz = state->dts_options[DTRACEOPT_STRSIZE]; uintptr_t s1 = regs[r1]; uintptr_t s2 = regs[r2]; + size_t lim1, lim2; if (s1 != 0 && - 
!dtrace_strcanload(s1, sz, mstate, vstate)) + !dtrace_strcanload(s1, sz, &lim1, mstate, vstate)) break; if (s2 != 0 && - !dtrace_strcanload(s2, sz, mstate, vstate)) + !dtrace_strcanload(s2, sz, &lim2, mstate, vstate)) break; - cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz); + cc_r = dtrace_strncmp((char *)s1, (char *)s2, + MIN(lim1, lim2)); cc_n = cc_r < 0; cc_z = cc_r == 0; @@ -5176,12 +6654,14 @@ dtrace_dif_emulate(dtrace_difo_t *difo, ASSERT(id >= DIF_VAR_OTHER_UBASE); id -= DIF_VAR_OTHER_UBASE; + VERIFY(id < vstate->dtvs_nglobals); svar = vstate->dtvs_globals[id]; ASSERT(svar != NULL); v = &svar->dtsv_var; if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { uintptr_t a = (uintptr_t)svar->dtsv_data; + size_t lim; ASSERT(a != 0); ASSERT(svar->dtsv_size != 0); @@ -5195,11 +6675,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, } if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], &v->dtdv_type, - mstate, vstate)) + &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - (void *)a, &v->dtdv_type); + (void *)a, &v->dtdv_type, lim); break; } @@ -5238,6 +6718,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { uintptr_t a = (uintptr_t)svar->dtsv_data; size_t sz = v->dtdv_type.dtdt_size; + size_t lim; sz += sizeof (uint64_t); ASSERT(svar->dtsv_size == NCPU * sz); @@ -5267,7 +6748,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, ASSERT(id >= DIF_VAR_OTHER_UBASE); id -= DIF_VAR_OTHER_UBASE; - ASSERT(id < vstate->dtvs_nlocals); + VERIFY(id < vstate->dtvs_nlocals); ASSERT(vstate->dtvs_locals != NULL); svar = vstate->dtvs_locals[id]; @@ -5277,6 +6758,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { uintptr_t a = (uintptr_t)svar->dtsv_data; size_t sz = v->dtdv_type.dtdt_size; + size_t lim; sz += sizeof (uint64_t); ASSERT(svar->dtsv_size == NCPU * sz); @@ -5292,11 +6774,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], &v->dtdv_type, - mstate, vstate)) + &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - (void *)a, &v->dtdv_type); + (void *)a, &v->dtdv_type, lim); break; } @@ -5345,6 +6827,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, id = DIF_INSTR_VAR(instr); ASSERT(id >= DIF_VAR_OTHER_UBASE); id -= DIF_VAR_OTHER_UBASE; + VERIFY(id < vstate->dtvs_ntlocals); key = &tupregs[DIF_DTR_NREGS]; key[0].dttk_value = (uint64_t)id; @@ -5369,13 +6852,15 @@ dtrace_dif_emulate(dtrace_difo_t *difo, break; if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + size_t lim; + if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], - &v->dtdv_type, mstate, vstate)) + &v->dtdv_type, &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - dvar->dtdv_data, &v->dtdv_type); + dvar->dtdv_data, &v->dtdv_type, lim); } else { *((uint64_t *)dvar->dtdv_data) = regs[rd]; } @@ -5412,6 +6897,11 @@ dtrace_dif_emulate(dtrace_difo_t *difo, regs[r2] ? 
regs[r2] : dtrace_strsize_default) + 1; } else { + if (regs[r2] > LONG_MAX) { + *flags |= CPU_DTRACE_ILLOP; + break; + } + tupregs[ttop].dttk_size = regs[r2]; } @@ -5453,8 +6943,10 @@ dtrace_dif_emulate(dtrace_difo_t *difo, if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) { DTRACE_TLS_THRKEY(key[nkeys].dttk_value); key[nkeys++].dttk_size = 0; + VERIFY(id < vstate->dtvs_ntlocals); v = &vstate->dtvs_tlocals[id]; } else { + VERIFY(id < vstate->dtvs_nglobals); v = &vstate->dtvs_globals[id]->dtsv_var; } @@ -5508,13 +7000,15 @@ dtrace_dif_emulate(dtrace_difo_t *difo, break; if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) { + size_t lim; + if (!dtrace_vcanload( (void *)(uintptr_t)regs[rd], &v->dtdv_type, - mstate, vstate)) + &lim, mstate, vstate)) break; dtrace_vcopy((void *)(uintptr_t)regs[rd], - dvar->dtdv_data, &v->dtdv_type); + dvar->dtdv_data, &v->dtdv_type, lim); } else { *((uint64_t *)dvar->dtdv_data) = regs[rd]; } @@ -5551,6 +7045,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, *illval = regs[rd]; break; } + if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate)) break; @@ -5679,9 +7174,15 @@ dtrace_action_breakpoint(dtrace_ecb_t *e c[i++] = ')'; c[i] = '\0'; -#if defined(sun) +#ifdef illumos debug_enter(c); -#else +#endif + +#ifdef __FreeBSD__ + kdb_enter(KDB_WHY_DTRACE, "breakpoint action"); +#endif + +#ifdef __NetBSD__ #ifdef DDB db_printf("%s\n", c); Debugger(); @@ -5731,7 +7232,7 @@ dtrace_action_raise(uint64_t sig) return; } -#if defined(sun) +#ifdef illumos /* * raise() has a queue depth of 1 -- we ignore all subsequent * invocations of the raise() action. @@ -5741,7 +7242,15 @@ dtrace_action_raise(uint64_t sig) curthread->t_sig_check = 1; aston(curthread); -#else +#endif + +#ifdef __FreeBSD__ + PROC_LOCK(p); + kern_psignal(p, sig); + PROC_UNLOCK(p); +#endif + +#ifdef __NetBSD__ struct proc *p = curproc; mutex_enter(proc_lock); psignal(p, sig); @@ -5755,13 +7264,21 @@ dtrace_action_stop(void) if (dtrace_destructive_disallow) return; -#if defined(sun) +#ifdef illumos if (!curthread->t_dtrace_stop) { curthread->t_dtrace_stop = 1; curthread->t_sig_check = 1; aston(curthread); } -#else +#endif + +#ifdef __FreeBSD__ + PROC_LOCK(p); + kern_psignal(p, SIGSTOP); + PROC_UNLOCK(p); +#endif + +#ifdef __NetBSD__ struct proc *p = curproc; mutex_enter(proc_lock); psignal(p, SIGSTOP); @@ -5772,10 +7289,9 @@ dtrace_action_stop(void) static void dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val) { -#if 0 /* XXX TBD - needs solaris_cpu */ hrtime_t now; volatile uint16_t *flags; -#if defined(sun) +#ifdef illumos cpu_t *cpu = CPU; #else cpu_t *cpu = &solaris_cpu[curcpu_id]; @@ -5817,7 +7333,6 @@ dtrace_action_chill(dtrace_mstate_t *mst */ mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP; cpu->cpu_dtrace_chilled += val; -#endif } static void @@ -5829,6 +7344,7 @@ dtrace_action_ustack(dtrace_mstate_t *ms uint64_t *pcs = &buf[1], *fps; char *str = (char *)&pcs[nframes]; int size, offs = 0, i, j; + size_t rem; uintptr_t old = mstate->dtms_scratch_ptr, saved; uint16_t *flags = &cpu_core[curcpu_id].cpuc_dtrace_flags; char *sym; @@ -5900,12 +7416,18 @@ dtrace_action_ustack(dtrace_mstate_t *ms continue; } + if (!dtrace_strcanload((uintptr_t)sym, strsize, &rem, mstate, + &(state->dts_vstate))) { + str[offs++] = '\0'; + continue; + } + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); /* * Now copy in the string that the helper returned to us. 
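The ULD* opcodes earlier in this chunk gain an explicit CPU_DTRACE_NOFAULT bracket around each dtrace_fuword*() call, so a bad user address latches a fault flag for the probe to notice instead of taking the machine down. The control flow, restated as a self-contained user-space model: everything here (cpu_flags, fake_fuword8(), the fault simulation) is invented to show the shape of the pattern, not the real kernel interfaces.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CPU_FLAG_NOFAULT	0x01
#define CPU_FLAG_FAULT		0x02

static unsigned cpu_flags;

/* Stand-in for dtrace_fuword8(): "faults" on a NULL address. */
static uint8_t
fake_fuword8(const uint8_t *uaddr)
{
	if (uaddr == NULL) {
		if (cpu_flags & CPU_FLAG_NOFAULT) {
			cpu_flags |= CPU_FLAG_FAULT;	/* latch, don't trap */
			return (0);
		}
		fprintf(stderr, "unhandled fault\n");
		abort();
	}
	return (*uaddr);
}

/* The bracket: set NOFAULT, load, clear NOFAULT; caller checks the flag. */
static uint8_t
uload8(const uint8_t *uaddr)
{
	uint8_t v;

	cpu_flags |= CPU_FLAG_NOFAULT;
	v = fake_fuword8(uaddr);
	cpu_flags &= ~CPU_FLAG_NOFAULT;
	return (v);
}

int
main(void)
{
	uint8_t byte = 0x5a;

	printf("good load: %#x, fault=%d\n", uload8(&byte),
	    (cpu_flags & CPU_FLAG_FAULT) != 0);
	cpu_flags = 0;
	printf("bad load:  %#x, fault=%d\n", uload8(NULL),
	    (cpu_flags & CPU_FLAG_FAULT) != 0);
	return (0);
}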
*/ - for (j = 0; offs + j < strsize; j++) { + for (j = 0; offs + j < strsize && j < rem; j++) { if ((str[offs + j] = sym[j]) == '\0') break; } @@ -5933,6 +7455,63 @@ out: mstate->dtms_scratch_ptr = old; } +static void +dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size, + size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind) +{ + volatile uint16_t *flags; + uint64_t val = *valp; + size_t valoffs = *valoffsp; + + flags = (volatile uint16_t *)&cpu_core[curcpu_id].cpuc_dtrace_flags; + ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF); + + /* + * If this is a string, we're going to only load until we find the zero + * byte -- after which we'll store zero bytes. + */ + if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) { + char c = '\0' + 1; + size_t s; + + for (s = 0; s < size; s++) { + if (c != '\0' && dtkind == DIF_TF_BYREF) { + c = dtrace_load8(val++); + } else if (c != '\0' && dtkind == DIF_TF_BYUREF) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + c = dtrace_fuword8((void *)(uintptr_t)val++); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + if (*flags & CPU_DTRACE_FAULT) + break; + } + + DTRACE_STORE(uint8_t, tomax, valoffs++, c); + + if (c == '\0' && intuple) + break; + } + } else { + uint8_t c; + while (valoffs < end) { + if (dtkind == DIF_TF_BYREF) { + c = dtrace_load8(val++); + } else if (dtkind == DIF_TF_BYUREF) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + c = dtrace_fuword8((void *)(uintptr_t)val++); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + if (*flags & CPU_DTRACE_FAULT) + break; + } + + DTRACE_STORE(uint8_t, tomax, + valoffs++, c); + } + } + + *valp = val; + *valoffsp = valoffs; +} + /* * If you're looking for the epicenter of DTrace, you just found it. This * is the function called by the provider to fire a probe -- from which all @@ -5954,7 +7533,10 @@ dtrace_probe(dtrace_id_t id, uintptr_t a volatile uint16_t *flags; hrtime_t now; -#if defined(sun) + if (panicstr != NULL) + return; + +#ifdef illumos /* * Kick out immediately if this CPU is still being born (in which case * curthread will be set to -1) or the current thread can't allow @@ -5979,7 +7561,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a return; } -#if defined(sun) +#ifdef illumos if (panic_quiesce) { #else if (panicstr != NULL) { @@ -5991,7 +7573,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t a return; } - now = dtrace_gethrtime(); + now = mstate.dtms_timestamp = dtrace_gethrtime(); + mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP; vtime = dtrace_vtime_references != 0; if (vtime && curthread->t_dtrace_start) @@ -6015,6 +7598,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid]; dtrace_vstate_t *vstate = &state->dts_vstate; dtrace_provider_t *prov = probe->dtpr_provider; + uint64_t tracememsize = 0; int committed = 0; caddr_t tomax; @@ -6031,6 +7615,8 @@ dtrace_probe(dtrace_id_t id, uintptr_t a uint64_t val = 0; mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; + mstate.dtms_getf = NULL; + *flags &= ~CPU_DTRACE_ERROR; if (prov == dtrace_provider) { @@ -6082,7 +7668,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a probe->dtpr_id, probe->dtpr_arg) == 0) continue; -#if defined(sun) +#ifdef illumos /* * This is more subtle than it looks. We have to be * absolutely certain that CRED() isn't going to @@ -6133,7 +7719,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a if (now - state->dts_alive > dtrace_deadman_timeout) { /* * We seem to be dead. 
Unless we (a) have kernel - * destructive permissions (b) have expicitly enabled + * destructive permissions (b) have explicitly enabled * destructive actions and (c) destructive actions have * not been disabled, we're going to transition into * the KILLED state, from which no further processing @@ -6161,8 +7747,18 @@ dtrace_probe(dtrace_id_t id, uintptr_t a tomax = buf->dtb_tomax; ASSERT(tomax != NULL); - if (ecb->dte_size != 0) - DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid); + if (ecb->dte_size != 0) { + dtrace_rechdr_t dtrh; + if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) { + mstate.dtms_timestamp = dtrace_gethrtime(); + mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP; + } + ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t)); + dtrh.dtrh_epid = ecb->dte_epid; + DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, + mstate.dtms_timestamp); + *((dtrace_rechdr_t *)(tomax + offs)) = dtrh; + } mstate.dtms_epid = ecb->dte_epid; mstate.dtms_present |= DTRACE_MSTATE_EPID; @@ -6174,7 +7770,7 @@ dtrace_probe(dtrace_id_t id, uintptr_t a if (pred != NULL) { dtrace_difo_t *dp = pred->dtp_difo; - int rval; + uint64_t rval; rval = dtrace_dif_emulate(dp, &mstate, vstate, state); @@ -6309,7 +7905,9 @@ dtrace_probe(dtrace_id_t id, uintptr_t a continue; switch (act->dta_kind) { - case DTRACEACT_SPECULATE: + case DTRACEACT_SPECULATE: { + dtrace_rechdr_t *dtrh; + ASSERT(buf == &state->dts_buffer[cpuid]); buf = dtrace_speculation_buffer(state, cpuid, val); @@ -6331,10 +7929,23 @@ dtrace_probe(dtrace_id_t id, uintptr_t a tomax = buf->dtb_tomax; ASSERT(tomax != NULL); - if (ecb->dte_size != 0) - DTRACE_STORE(uint32_t, tomax, offs, - ecb->dte_epid); + if (ecb->dte_size == 0) + continue; + + ASSERT3U(ecb->dte_size, >=, + sizeof (dtrace_rechdr_t)); + dtrh = ((void *)(tomax + offs)); + dtrh->dtrh_epid = ecb->dte_epid; + /* + * When the speculation is committed, all of + * the records in the speculative buffer will + * have their timestamps set to the commit + * time. Until then, it is set to a sentinel + * value, for debugability. + */ + DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX); continue; + } case DTRACEACT_PRINTM: { /* The DIF returns a 'memref'. */ @@ -6353,83 +7964,23 @@ dtrace_probe(dtrace_id_t id, uintptr_t a continue; } - /* Store the size in the buffer first. */ - DTRACE_STORE(uintptr_t, tomax, - valoffs, size); - - /* - * Offset the buffer address to the start - * of the data. - */ - valoffs += sizeof(uintptr_t); - - /* - * Reset to the memory address rather than - * the memref array, then let the BYREF - * code below do the work to store the - * memory data in the buffer. - */ - val = memref[0]; - break; - } - - case DTRACEACT_PRINTT: { - /* The DIF returns a 'typeref'. */ - uintptr_t *typeref = (uintptr_t *)(uintptr_t) val; - char c = '\0' + 1; - size_t s; - - /* - * Get the type string length and round it - * up so that the data that follows is - * aligned for easy access. - */ - size_t typs = strlen((char *) typeref[2]) + 1; - typs = roundup(typs, sizeof(uintptr_t)); - - /* - *Get the size from the typeref using the - * number of elements and the type size. - */ - size = typeref[1] * typeref[3]; - - /* - * Check if the size exceeds the allocated - * buffer size. - */ - if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) { - /* Flag a drop! */ - *flags |= CPU_DTRACE_DROP; - - } - - /* Store the size in the buffer first. */ - DTRACE_STORE(uintptr_t, tomax, - valoffs, size); - valoffs += sizeof(uintptr_t); - - /* Store the type size in the buffer. 
*/ - DTRACE_STORE(uintptr_t, tomax, - valoffs, typeref[3]); - valoffs += sizeof(uintptr_t); - - val = typeref[2]; - - for (s = 0; s < typs; s++) { - if (c != '\0') - c = dtrace_load8(val++); + /* Store the size in the buffer first. */ + DTRACE_STORE(uintptr_t, tomax, + valoffs, size); - DTRACE_STORE(uint8_t, tomax, - valoffs++, c); - } + /* + * Offset the buffer address to the start + * of the data. + */ + valoffs += sizeof(uintptr_t); /* * Reset to the memory address rather than - * the typeref array, then let the BYREF + * the memref array, then let the BYREF * code below do the work to store the * memory data in the buffer. */ - val = typeref[0]; + val = memref[0]; break; } @@ -6466,6 +8017,11 @@ dtrace_probe(dtrace_id_t id, uintptr_t a case DTRACEACT_PRINTA: case DTRACEACT_SYSTEM: case DTRACEACT_FREOPEN: + case DTRACEACT_TRACEMEM: + break; + + case DTRACEACT_TRACEMEM_DYNSIZE: + tracememsize = val; break; case DTRACEACT_SYM: @@ -6477,14 +8033,15 @@ dtrace_probe(dtrace_id_t id, uintptr_t a case DTRACEACT_USYM: case DTRACEACT_UMOD: case DTRACEACT_UADDR: { -#if defined(sun) +#ifdef illumos struct pid *pid = curthread->t_procp->p_pidp; #endif + if (!dtrace_priv_proc(state)) continue; DTRACE_STORE(uint64_t, tomax, -#if defined(sun) +#ifdef illumos valoffs, (uint64_t)pid->pid_id); #else valoffs, (uint64_t) curproc->p_pid); @@ -6533,43 +8090,25 @@ dtrace_probe(dtrace_id_t id, uintptr_t a ASSERT(0); } - if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) { + if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF || + dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) { uintptr_t end = valoffs + size; - if (!dtrace_vcanload((void *)(uintptr_t)val, - &dp->dtdo_rtype, &mstate, vstate)) - continue; - - /* - * If this is a string, we're going to only - * load until we find the zero byte -- after - * which we'll store zero bytes. - */ - if (dp->dtdo_rtype.dtdt_kind == - DIF_TYPE_STRING) { - char c = '\0' + 1; - int intuple = act->dta_intuple; - size_t s; - - for (s = 0; s < size; s++) { - if (c != '\0') - c = dtrace_load8(val++); - - DTRACE_STORE(uint8_t, tomax, - valoffs++, c); - - if (c == '\0' && intuple) - break; - } - - continue; + if (tracememsize != 0 && + valoffs + tracememsize < end) { + end = valoffs + tracememsize; + tracememsize = 0; } - while (valoffs < end) { - DTRACE_STORE(uint8_t, tomax, valoffs++, - dtrace_load8(val++)); - } + if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF && + !dtrace_vcanload((void *)(uintptr_t)val, + &dp->dtdo_rtype, NULL, &mstate, vstate)) + continue; + dtrace_store_by_ref(dp, tomax, size, &valoffs, + &val, end, act->dta_intuple, + dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ? + DIF_TF_BYREF: DIF_TF_BYUREF); continue; } @@ -6927,15 +8466,11 @@ dtrace_cred2priv(cred_t *cr, uint32_t *p { uint32_t priv; -#if defined(sun) +#ifdef illumos if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) { /* - * For DTRACE_PRIV_ALL, the uid and zoneid don't matter, - * but for GCC they do. + * For DTRACE_PRIV_ALL, the uid and zoneid don't matter. 
*/ - *uidp = 0; - *zoneidp = 0; - priv = DTRACE_PRIV_ALL; } else { *uidp = crgetuid(cr); @@ -7447,18 +8982,13 @@ dtrace_register(const char *name, const if (pops->dtps_provide == NULL) { ASSERT(pops->dtps_provide_module != NULL); provider->dtpv_pops.dtps_provide = - (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop; + (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop; } if (pops->dtps_provide_module == NULL) { ASSERT(pops->dtps_provide != NULL); -#if defined(sun) provider->dtpv_pops.dtps_provide_module = (void (*)(void *, modctl_t *))dtrace_nullop; -#else - provider->dtpv_pops.dtps_provide_module = - (void (*)(void *, dtrace_modctl_t *))dtrace_nullop; -#endif } if (pops->dtps_suspend == NULL) { @@ -7530,17 +9060,17 @@ dtrace_unregister(dtrace_provider_id_t i { dtrace_provider_t *old = (dtrace_provider_t *)id; dtrace_provider_t *prev = NULL; - int i, self = 0; + int i, self = 0, noreap = 0; dtrace_probe_t *probe, *first = NULL; if (old->dtpv_pops.dtps_enable == - (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) { + (int (*)(void *, dtrace_id_t, void *))dtrace_nullop) { /* * If DTrace itself is the provider, we're called with locks * already held. */ ASSERT(old == dtrace_provider); -#if defined(sun) +#ifdef illumos ASSERT(dtrace_devi != NULL); #endif ASSERT(MUTEX_HELD(&dtrace_provider_lock)); @@ -7555,7 +9085,9 @@ dtrace_unregister(dtrace_provider_id_t i } } else { mutex_enter(&dtrace_provider_lock); +#ifdef illumos mutex_enter(&mod_lock); +#endif mutex_enter(&dtrace_lock); } @@ -7569,7 +9101,9 @@ dtrace_unregister(dtrace_provider_id_t i dtrace_anon.dta_state->dts_necbs > 0))) { if (!self) { mutex_exit(&dtrace_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); } return (EBUSY); @@ -7589,14 +9123,36 @@ dtrace_unregister(dtrace_provider_id_t i continue; /* + * If we are trying to unregister a defunct provider, and the + * provider was made defunct within the interval dictated by + * dtrace_unregister_defunct_reap, we'll (asynchronously) + * attempt to reap our enablings. To denote that the provider + * should reattempt to unregister itself at some point in the + * future, we will return a differentiable error code (EAGAIN + * instead of EBUSY) in this case. + */ + if (dtrace_gethrtime() - old->dtpv_defunct > + dtrace_unregister_defunct_reap) + noreap = 1; + + /* * We have at least one ECB; we can't remove this provider. 
*/ if (!self) { mutex_exit(&dtrace_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); } - return (EBUSY); + + if (noreap) + return (EBUSY); + + (void) taskq_dispatch(dtrace_taskq, + (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP); + + return (EAGAIN); } /* @@ -7640,16 +9196,20 @@ dtrace_unregister(dtrace_provider_id_t i kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); -#if defined(sun) +#ifdef illumos vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1); -#else +#endif +#ifdef __FreeBSD__ + free_unr(dtrace_arena, probe->dtpr_id); +#endif +#ifdef __NetBSD__ vmem_free(dtrace_arena, (uintptr_t)(probe->dtpr_id), 1); #endif kmem_free(probe, sizeof (dtrace_probe_t)); } if ((prev = dtrace_provider) == old) { -#if defined(sun) +#ifdef illumos ASSERT(self || dtrace_devi == NULL); ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL); #endif @@ -7668,7 +9228,9 @@ dtrace_unregister(dtrace_provider_id_t i if (!self) { mutex_exit(&dtrace_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); } @@ -7688,12 +9250,12 @@ dtrace_invalidate(dtrace_provider_id_t i dtrace_provider_t *pvp = (dtrace_provider_t *)id; ASSERT(pvp->dtpv_pops.dtps_enable != - (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); - pvp->dtpv_defunct = 1; + pvp->dtpv_defunct = dtrace_gethrtime(); mutex_exit(&dtrace_lock); mutex_exit(&dtrace_provider_lock); @@ -7729,7 +9291,7 @@ dtrace_condense(dtrace_provider_id_t id) * Make sure this isn't the dtrace provider itself. */ ASSERT(prov->dtpv_pops.dtps_enable != - (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -7759,9 +9321,13 @@ dtrace_condense(dtrace_provider_id_t id) kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); kmem_free(probe, sizeof (dtrace_probe_t)); -#if defined(sun) +#ifdef illumos vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1); -#else +#endif +#ifdef __FreeBSD__ + free_unr(dtrace_arena, i + 1); +#endif +#ifdef __NetBSD__ vmem_free(dtrace_arena, ((uintptr_t)i + 1), 1); #endif } @@ -7792,7 +9358,6 @@ dtrace_probe_create(dtrace_provider_id_t dtrace_probe_t *probe, **probes; dtrace_provider_t *provider = (dtrace_provider_t *)prov; dtrace_id_t id; - vmem_addr_t offset; if (provider == dtrace_provider) { ASSERT(MUTEX_HELD(&dtrace_lock)); @@ -7800,9 +9365,19 @@ dtrace_probe_create(dtrace_provider_id_t mutex_enter(&dtrace_lock); } +#ifdef illumos + id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1, + VM_BESTFIT | VM_SLEEP); +#endif +#ifdef __FreeBSD__ + id = alloc_unr(dtrace_arena); +#endif +#ifdef __NetBSD__ + vmem_addr_t offset; if (vmem_alloc(dtrace_arena, 1, VM_BESTFIT | VM_SLEEP, &offset) != 0) ASSERT(0); id = (dtrace_id_t)(uintptr_t)offset; +#endif probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP); probe->dtpr_id = id; @@ -7829,7 +9404,6 @@ dtrace_probe_create(dtrace_provider_id_t } probes = kmem_zalloc(nsize, KM_SLEEP); - dtrace_probes_size = nsize; if (dtrace_probes == NULL) { ASSERT(osize == 0); @@ -7888,8 +9462,8 @@ dtrace_probe_lookup_match(dtrace_probe_t * name and probe name. 
*/ dtrace_id_t -dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod, - const char *func, const char *name) +dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod, + char *func, char *name) { dtrace_probekey_t pkey; dtrace_id_t id; @@ -7897,11 +9471,11 @@ dtrace_probe_lookup(dtrace_provider_id_t pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name; pkey.dtpk_pmatch = &dtrace_match_string; - pkey.dtpk_mod = __UNCONST(mod); + pkey.dtpk_mod = mod; pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul; - pkey.dtpk_func = __UNCONST(func); + pkey.dtpk_func = func; pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul; - pkey.dtpk_name = __UNCONST(name); + pkey.dtpk_name = name; pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul; pkey.dtpk_id = DTRACE_IDNONE; @@ -7951,21 +9525,6 @@ dtrace_probe_description(const dtrace_pr (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1); } -#ifdef notyet /* XXX TBD */ -#if !defined(sun) -static int -dtrace_probe_provide_cb(linker_file_t lf, void *arg) -{ - dtrace_provider_t *prv = (dtrace_provider_t *) arg; - - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, lf); - - return(0); -} -#endif -#endif /* notyet */ - - /* * Called to indicate that a probe -- or probes -- should be provided by a * specfied provider. If the specified description is NULL, the provider will @@ -7984,9 +9543,10 @@ dtrace_probe_provide_cb(linker_file_t lf static void dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) { -#if defined(sun) +#ifdef illumos modctl_t *ctl; -#else +#endif +#ifdef __NetBSD__ module_t *mod; #endif int all = 0; @@ -8004,6 +9564,7 @@ dtrace_probe_provide(dtrace_probedesc_t */ prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc); +#ifdef illumos /* * Now call the per-module provide operation. We will grab * mod_lock to prevent the list from being modified. Note @@ -8012,7 +9573,6 @@ dtrace_probe_provide(dtrace_probedesc_t */ mutex_enter(&mod_lock); -#if defined(sun) ctl = &modules; do { if (ctl->mod_busy || ctl->mod_mp == NULL) @@ -8021,29 +9581,25 @@ dtrace_probe_provide(dtrace_probedesc_t prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); } while ((ctl = ctl->mod_next) != &modules); -#else + + mutex_exit(&mod_lock); +#endif +#ifdef __NetBSD__ + kernconfig_lock(); /* Fake netbsd module first */ - if (mod_nbsd == NULL) { - mod_nbsd = kmem_zalloc(sizeof(*mod_nbsd), KM_SLEEP); - mod_nbsd->mod_info = kmem_zalloc(sizeof(modinfo_t), KM_SLEEP); - mod_nbsd->mod_refcnt = 1; - *((char **)(intptr_t)&mod_nbsd->mod_info->mi_name) = __UNCONST("netbsd"); - } + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, module_kernel()); - kernconfig_lock(); - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, mod_nbsd); TAILQ_FOREACH(mod, &module_list, mod_chain) { - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, mod); + if (module_source(mod) != MODULE_SOURCE_KERNEL) + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, mod); } kernconfig_unlock(); #endif - - mutex_exit(&mod_lock); } while (all && (prv = prv->dtpv_next) != NULL); } -#if defined(sun) +#ifdef illumos /* * Iterate over each probe, and call the Framework-to-Provider API function * denoted by offs. @@ -8204,6 +9760,10 @@ dtrace_helper_provide_one(dof_helper_t * probe = (dof_probe_t *)(uintptr_t)(daddr + prb_sec->dofs_offset + i * prb_sec->dofs_entsize); + /* See the check in dtrace_helper_provider_validate(). 
*/ + if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) + continue; + dhpb.dthpb_mod = dhp->dofhp_mod; dhpb.dthpb_func = strtab + probe->dofpr_func; dhpb.dthpb_name = strtab + probe->dofpr_name; @@ -8256,7 +9816,6 @@ dtrace_helper_provide(dof_helper_t *dhp, dtrace_enabling_matchall(); } -#if defined(sun) static void dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid) { @@ -8304,7 +9863,6 @@ dtrace_helper_provider_remove(dof_helper dtrace_helper_provider_remove_one(dhp, sec, pid); } } -#endif /* * DTrace Meta Provider-to-Framework API Functions @@ -8464,6 +10022,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err; int kcheckload; uint_t pc; + int maxglobal = -1, maxlocal = -1, maxtlocal = -1; kcheckload = cr == NULL || (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0; @@ -8695,6 +10254,19 @@ dtrace_difo_validate(dtrace_difo_t *dp, subr == DIF_SUBR_COPYOUTSTR) { dp->dtdo_destructive = 1; } + if (subr == DIF_SUBR_GETF) { + /* + * If we have a getf() we need to record that + * in our state. Note that our state can be + * NULL if this is a helper -- but in that + * case, the call to getf() is itself illegal, + * and will be caught (slightly later) when + * the helper is validated. + */ + if (vstate->dtvs_state != NULL) + vstate->dtvs_state->dts_getf++; + } + break; case DIF_OP_PUSHTR: if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF) @@ -8724,7 +10296,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, "expected 'ret' as last DIF instruction\n"); } - if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) { + if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) { /* * If we're not returning by reference, the size must be either * 0 or the size of one of the base types. 
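Editorial aside: the maxglobal/maxlocal/maxtlocal counters declared in the hunk above feed a new two-pass check added by the hunks that follow: the walk over the variable table records the largest declared index per scope, and a second walk over the instruction stream rejects any load or store whose variable id exceeds that bound. The stand-alone sketch below only illustrates that two-pass idea; the toy_* names and the instruction encoding are invented for the example and are not the kernel's DIF representation.

#include <stdio.h>
#include <stddef.h>

enum toy_op { LD_GLOBAL, ST_GLOBAL, LD_LOCAL, ST_LOCAL };
enum toy_scope { SCOPE_GLOBAL, SCOPE_LOCAL };

/* Toy instruction: an opcode plus the variable index it references. */
struct toy_instr {
	enum toy_op op;
	unsigned var;
};

/* Toy variable-table entry: the scope and index the object declares. */
struct toy_var {
	enum toy_scope scope;
	unsigned idx;
};

/* Returns 0 if every instruction references a declared variable index. */
static int
toy_validate(const struct toy_instr *text, size_t ntext,
    const struct toy_var *vars, size_t nvars)
{
	long maxglobal = -1, maxlocal = -1;
	size_t i;

	/* Pass 1: record the largest declared index for each scope. */
	for (i = 0; i < nvars; i++) {
		long *max = vars[i].scope == SCOPE_GLOBAL ?
		    &maxglobal : &maxlocal;
		if ((long)vars[i].idx > *max)
			*max = vars[i].idx;
	}

	/* Pass 2: reject any load or store beyond the recorded bound. */
	for (i = 0; i < ntext; i++) {
		int global = text[i].op == LD_GLOBAL ||
		    text[i].op == ST_GLOBAL;
		long bound = global ? maxglobal : maxlocal;

		if ((long)text[i].var > bound) {
			printf("invalid variable %u at pc %zu\n",
			    text[i].var, i);
			return -1;
		}
	}
	return 0;
}

int
main(void)
{
	struct toy_var vars[] = { { SCOPE_GLOBAL, 0 }, { SCOPE_LOCAL, 2 } };
	/* The store below names local #3, but only locals up to #2 exist. */
	struct toy_instr text[] = { { LD_GLOBAL, 0 }, { ST_LOCAL, 3 } };

	return toy_validate(text, 2, vars, 2) == 0 ? 0 : 1;
}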
@@ -8779,6 +10351,9 @@ dtrace_difo_validate(dtrace_difo_t *dp, switch (v->dtdv_scope) { case DIFV_SCOPE_GLOBAL: + if (maxglobal == -1 || ndx > maxglobal) + maxglobal = ndx; + if (ndx < vstate->dtvs_nglobals) { dtrace_statvar_t *svar; @@ -8789,11 +10364,17 @@ dtrace_difo_validate(dtrace_difo_t *dp, break; case DIFV_SCOPE_THREAD: + if (maxtlocal == -1 || ndx > maxtlocal) + maxtlocal = ndx; + if (ndx < vstate->dtvs_ntlocals) existing = &vstate->dtvs_tlocals[ndx]; break; case DIFV_SCOPE_LOCAL: + if (maxlocal == -1 || ndx > maxlocal) + maxlocal = ndx; + if (ndx < vstate->dtvs_nlocals) { dtrace_statvar_t *svar; @@ -8812,9 +10393,10 @@ dtrace_difo_validate(dtrace_difo_t *dp, break; } - if (v->dtdv_scope == DIFV_SCOPE_GLOBAL && - vt->dtdt_size > dtrace_global_maxsize) { - err += efunc(i, "oversized by-ref global\n"); + if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL || + v->dtdv_scope == DIFV_SCOPE_LOCAL) && + vt->dtdt_size > dtrace_statvar_maxsize) { + err += efunc(i, "oversized by-ref static\n"); break; } } @@ -8841,10 +10423,40 @@ dtrace_difo_validate(dtrace_difo_t *dp, } } + for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) { + dif_instr_t instr = dp->dtdo_buf[pc]; + + uint_t v = DIF_INSTR_VAR(instr); + uint_t op = DIF_INSTR_OP(instr); + + switch (op) { + case DIF_OP_LDGS: + case DIF_OP_LDGAA: + case DIF_OP_STGS: + case DIF_OP_STGAA: + if (v > DIF_VAR_OTHER_UBASE + maxglobal) + err += efunc(pc, "invalid variable %u\n", v); + break; + case DIF_OP_LDTS: + case DIF_OP_LDTAA: + case DIF_OP_STTS: + case DIF_OP_STTAA: + if (v > DIF_VAR_OTHER_UBASE + maxtlocal) + err += efunc(pc, "invalid variable %u\n", v); + break; + case DIF_OP_LDLS: + case DIF_OP_STLS: + if (v > DIF_VAR_OTHER_UBASE + maxlocal) + err += efunc(pc, "invalid variable %u\n", v); + break; + default: + break; + } + } + return (err); } -#if defined(sun) /* * Validate a DTrace DIF object that it is to be used as a helper. Helpers * are much more constrained than normal DIFOs. Specifically, they may @@ -8975,7 +10587,9 @@ dtrace_difo_validate_helper(dtrace_difo_ subr == DIF_SUBR_INET_NTOA || subr == DIF_SUBR_INET_NTOA6 || subr == DIF_SUBR_INET_NTOP || + subr == DIF_SUBR_JSON || subr == DIF_SUBR_LLTOSTR || + subr == DIF_SUBR_STRTOLL || subr == DIF_SUBR_RINDEX || subr == DIF_SUBR_STRCHR || subr == DIF_SUBR_STRJOIN || @@ -8987,9 +10601,13 @@ dtrace_difo_validate_helper(dtrace_difo_ subr == DIF_SUBR_NTOHS || subr == DIF_SUBR_NTOHL || subr == DIF_SUBR_NTOHLL || - subr == DIF_SUBR_MEMREF || - subr == DIF_SUBR_TYPEREF) + subr == DIF_SUBR_MEMREF) + break; + +#if defined(__FreeBSD__) || defined(__NetBSD__) + if (subr == DIF_SUBR_MEMSTR) break; +#endif err += efunc(pc, "invalid subr %u\n", subr); break; @@ -9002,7 +10620,6 @@ dtrace_difo_validate_helper(dtrace_difo_ return (err); } -#endif /* * Returns 1 if the expression in the DIF object can be cached on a per-thread @@ -9155,6 +10772,9 @@ dtrace_difo_chunksize(dtrace_difo_t *dp, if (srd == 0) return; + if (sval > LONG_MAX) + return; + tupregs[ttop++].dttk_size = sval; } @@ -9216,6 +10836,19 @@ dtrace_difo_chunksize(dtrace_difo_t *dp, */ size = P2ROUNDUP(size, sizeof (uint64_t)); + /* + * Before setting the chunk size, check that we're not going + * to set it to a negative value... + */ + if (size > LONG_MAX) + return; + + /* + * ...and make certain that we didn't badly overflow. 
+ */
+ */ + if (size < ksize || size < sizeof (dtrace_dynvar_t)) + return; + if (size > vstate->dtvs_dynvars.dtds_chunksize) vstate->dtvs_dynvars.dtds_chunksize = size; } @@ -9334,7 +10967,6 @@ dtrace_difo_init(dtrace_difo_t *dp, dtra dtrace_difo_hold(dp); } -#if defined(sun) static dtrace_difo_t * dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate) { @@ -9378,7 +11010,6 @@ dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_difo_init(new, vstate); return (new); } -#endif static void dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate) @@ -9626,7 +11257,7 @@ dtrace_actdesc_create(dtrace_actkind_t k { dtrace_actdesc_t *act; -#if defined(sun) +#ifdef illumos ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL && arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA)); #endif @@ -9665,7 +11296,7 @@ dtrace_actdesc_release(dtrace_actdesc_t if (DTRACEACT_ISPRINTFLIKE(kind)) { char *str = (char *)(uintptr_t)act->dtad_arg; -#if defined(sun) +#ifdef illumos ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) || (str == NULL && act->dtad_kind == DTRACEACT_PRINTA)); #endif @@ -9694,9 +11325,9 @@ dtrace_ecb_add(dtrace_state_t *state, dt /* * The default size is the size of the default action: recording - * the epid. + * the header. */ - ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t); + ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t); ecb->dte_alignment = sizeof (dtrace_epid_t); epid = state->dts_epid++; @@ -9792,125 +11423,99 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) } } -static void +static int dtrace_ecb_resize(dtrace_ecb_t *ecb) { - uint32_t maxalign = sizeof (dtrace_epid_t); - uint32_t align = sizeof (uint8_t), offs, diff; dtrace_action_t *act; - int wastuple = 0; + uint32_t curneeded = UINT32_MAX; uint32_t aggbase = UINT32_MAX; - dtrace_state_t *state = ecb->dte_state; /* - * If we record anything, we always record the epid. (And we always - * record it first.) + * If we record anything, we always record the dtrace_rechdr_t. (And + * we always record it first.) */ - offs = sizeof (dtrace_epid_t); - ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t); + ecb->dte_size = sizeof (dtrace_rechdr_t); + ecb->dte_alignment = sizeof (dtrace_epid_t); for (act = ecb->dte_action; act != NULL; act = act->dta_next) { dtrace_recdesc_t *rec = &act->dta_rec; + ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1); - if ((align = rec->dtrd_alignment) > maxalign) - maxalign = align; - - if (!wastuple && act->dta_intuple) { - /* - * This is the first record in a tuple. Align the - * offset to be at offset 4 in an 8-byte aligned - * block. - */ - diff = offs + sizeof (dtrace_aggid_t); - - if ((diff = (diff & (sizeof (uint64_t) - 1)))) - offs += sizeof (uint64_t) - diff; - - aggbase = offs - sizeof (dtrace_aggid_t); - ASSERT(!(aggbase & (sizeof (uint64_t) - 1))); - } - - /*LINTED*/ - if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) { - /* - * The current offset is not properly aligned; align it. 
- */ - offs += align - diff; - } - - rec->dtrd_offset = offs; - - if (offs + rec->dtrd_size > ecb->dte_needed) { - ecb->dte_needed = offs + rec->dtrd_size; - - if (ecb->dte_needed > state->dts_needed) - state->dts_needed = ecb->dte_needed; - } + ecb->dte_alignment = MAX(ecb->dte_alignment, + rec->dtrd_alignment); if (DTRACEACT_ISAGG(act->dta_kind)) { dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act; - dtrace_action_t *first = agg->dtag_first, *prev; - ASSERT(rec->dtrd_size != 0 && first != NULL); - ASSERT(wastuple); + ASSERT(rec->dtrd_size != 0); + ASSERT(agg->dtag_first != NULL); + ASSERT(act->dta_prev->dta_intuple); ASSERT(aggbase != UINT32_MAX); + ASSERT(curneeded != UINT32_MAX); agg->dtag_base = aggbase; - while ((prev = first->dta_prev) != NULL && - DTRACEACT_ISAGG(prev->dta_kind)) { - agg = (dtrace_aggregation_t *)prev; - first = agg->dtag_first; - } + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + if (curneeded + rec->dtrd_size < curneeded) + return (EINVAL); + curneeded += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, curneeded); - if (prev != NULL) { - offs = prev->dta_rec.dtrd_offset + - prev->dta_rec.dtrd_size; - } else { - offs = sizeof (dtrace_epid_t); - } - wastuple = 0; + aggbase = UINT32_MAX; + curneeded = UINT32_MAX; + } else if (act->dta_intuple) { + if (curneeded == UINT32_MAX) { + /* + * This is the first record in a tuple. Align + * curneeded to be at offset 4 in an 8-byte + * aligned block. + */ + ASSERT(act->dta_prev == NULL || + !act->dta_prev->dta_intuple); + ASSERT3U(aggbase, ==, UINT32_MAX); + curneeded = P2PHASEUP(ecb->dte_size, + sizeof (uint64_t), sizeof (dtrace_aggid_t)); + + aggbase = curneeded - sizeof (dtrace_aggid_t); + ASSERT(IS_P2ALIGNED(aggbase, + sizeof (uint64_t))); + } + curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment); + rec->dtrd_offset = curneeded; + if (curneeded + rec->dtrd_size < curneeded) + return (EINVAL); + curneeded += rec->dtrd_size; } else { - if (!act->dta_intuple) - ecb->dte_size = offs + rec->dtrd_size; - - offs += rec->dtrd_size; + /* tuples must be followed by an aggregation */ + ASSERT(act->dta_prev == NULL || + !act->dta_prev->dta_intuple); + + ecb->dte_size = P2ROUNDUP(ecb->dte_size, + rec->dtrd_alignment); + rec->dtrd_offset = ecb->dte_size; + if (ecb->dte_size + rec->dtrd_size < ecb->dte_size) + return (EINVAL); + ecb->dte_size += rec->dtrd_size; + ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size); } - - wastuple = act->dta_intuple; } if ((act = ecb->dte_action) != NULL && !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) && - ecb->dte_size == sizeof (dtrace_epid_t)) { + ecb->dte_size == sizeof (dtrace_rechdr_t)) { /* - * If the size is still sizeof (dtrace_epid_t), then all + * If the size is still sizeof (dtrace_rechdr_t), then all * actions store no data; set the size to 0. */ - ecb->dte_alignment = maxalign; ecb->dte_size = 0; - - /* - * If the needed space is still sizeof (dtrace_epid_t), then - * all actions need no additional space; set the needed - * size to 0. - */ - if (ecb->dte_needed == sizeof (dtrace_epid_t)) - ecb->dte_needed = 0; - - return; } - /* - * Set our alignment, and make sure that the dte_size and dte_needed - * are aligned to the size of an EPID. 
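Editorial aside: the rewritten dtrace_ecb_resize() above leans on P2ROUNDUP (round an offset up to an alignment), P2PHASEUP (round up to a given phase within an alignment, used to leave room for the aggregation id in front of a tuple), and a wrap-around test of the form (x + delta < x) on every size accumulation. A minimal user-space sketch of those three ingredients; the macro definitions are written out here only to make the example self-contained, the kernel takes them from its own headers.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Illustrative definitions for the example. */
#define P2ROUNDUP(x, a)      (-(-(x) & -(a)))
#define P2PHASEUP(x, a, ph)  ((ph) - (((ph) - (x)) & -(a)))

int
main(void)
{
	size_t offs = 13;

	/* Align a record offset to 8 bytes: 13 -> 16. */
	printf("P2ROUNDUP(13, 8)    = %zu\n", (size_t)P2ROUNDUP(offs, 8));

	/* Place the offset at phase 4 of an 8-byte block: 13 -> 20. */
	printf("P2PHASEUP(13, 8, 4) = %zu\n",
	    (size_t)P2PHASEUP(offs, 8, (size_t)4));

	/* Wrap-around check: adding a record size must not overflow. */
	size_t cur = SIZE_MAX - 4, rec = 8;
	if (cur + rec < cur)
		printf("overflow detected, reject with EINVAL\n");
	return 0;
}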
- */ - ecb->dte_alignment = maxalign; - ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) & - ~(sizeof (dtrace_epid_t) - 1); - ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) & - ~(sizeof (dtrace_epid_t) - 1); - ASSERT(ecb->dte_size <= ecb->dte_needed); + ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t)); + ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t))); + ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, + ecb->dte_needed); + return (0); } static dtrace_action_t * @@ -9923,7 +11528,6 @@ dtrace_ecb_aggregation_create(dtrace_ecb dtrace_recdesc_t *frec; dtrace_aggid_t aggid; dtrace_state_t *state = ecb->dte_state; - vmem_addr_t offset; agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP); agg->dtag_ecb = ecb; @@ -9965,6 +11569,35 @@ dtrace_ecb_aggregation_create(dtrace_ecb break; } + case DTRACEAGG_LLQUANTIZE: { + uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg); + uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg); + uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg); + uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg); + int64_t v; + + agg->dtag_initial = desc->dtad_arg; + agg->dtag_aggregate = dtrace_aggregate_llquantize; + + if (factor < 2 || low >= high || nsteps < factor) + goto err; + + /* + * Now check that the number of steps evenly divides a power + * of the factor. (This assures both integer bucket size and + * linearity within each magnitude.) + */ + for (v = factor; v < nsteps; v *= factor) + continue; + + if ((v % nsteps) || (nsteps % factor)) + goto err; + + size = (dtrace_aggregate_llquantize_bucket(factor, + low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t); + break; + } + case DTRACEAGG_AVG: agg->dtag_aggregate = dtrace_aggregate_avg; size = sizeof (uint64_t) * 2; @@ -10030,11 +11663,22 @@ success: /* * We need to allocate an id for this aggregation. */ + +#ifdef illumos + aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1, + VM_BESTFIT | VM_SLEEP); +#endif +#ifdef __FreeBSD__ + aggid = alloc_unr(state->dts_aggid_arena); +#endif +#ifdef __NetBSD__ + vmem_addr_t offset; + if (vmem_alloc(state->dts_aggid_arena, 1, VM_BESTFIT | VM_SLEEP, &offset) != 0) ASSERT(0); aggid = (dtrace_aggid_t)(uintptr_t)offset; - +#endif if (aggid - 1 >= state->dts_naggregations) { dtrace_aggregation_t **oaggs = state->dts_aggregations; @@ -10083,9 +11727,13 @@ dtrace_ecb_aggregation_destroy(dtrace_ec dtrace_aggid_t aggid = agg->dtag_id; ASSERT(DTRACEACT_ISAGG(act->dta_kind)); -#if defined(sun) +#ifdef illumos vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1); -#else +#endif +#ifdef __FreeBSD__ + free_unr(state->dts_aggid_arena, aggid); +#endif +#ifdef __NetBSD__ vmem_free(state->dts_aggid_arena, (uintptr_t)aggid, 1); #endif @@ -10141,16 +11789,18 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, case DTRACEACT_PRINTA: case DTRACEACT_SYSTEM: case DTRACEACT_FREOPEN: + case DTRACEACT_DIFEXPR: /* * We know that our arg is a string -- turn it into a * format. 
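Editorial aside on the llquantize() aggregation added in the hunk above: the validation requires factor >= 2, low < high, nsteps >= factor, and that nsteps evenly divides a power of the factor, which keeps every bucket an integer width and keeps the steps linear within each order of magnitude. A quick worked check of that last condition, mirroring the loop in the patch; the helper name below exists only for this example.

#include <stdio.h>
#include <stdint.h>

/* Returns 1 if nsteps evenly divides some power of factor (and vice versa). */
static int
llq_steps_ok(uint16_t factor, uint16_t nsteps)
{
	int64_t v;

	if (factor < 2 || nsteps < factor)
		return 0;

	/* Walk powers of factor until we reach or pass nsteps... */
	for (v = factor; v < nsteps; v *= factor)
		continue;

	/* ...then both divisibility conditions from the patch must hold. */
	return (v % nsteps == 0) && (nsteps % factor == 0);
}

int
main(void)
{
	/* factor=10, nsteps=20: v ends at 100, 100 % 20 == 0, 20 % 10 == 0. */
	printf("10/20 -> %d\n", llq_steps_ok(10, 20));
	/* factor=10, nsteps=30: v ends at 100, 100 % 30 != 0 -> rejected. */
	printf("10/30 -> %d\n", llq_steps_ok(10, 30));
	return 0;
}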
*/ if (arg == 0) { - ASSERT(desc->dtad_kind == DTRACEACT_PRINTA); + ASSERT(desc->dtad_kind == DTRACEACT_PRINTA || + desc->dtad_kind == DTRACEACT_DIFEXPR); format = 0; } else { ASSERT(arg != 0); -#if defined(sun) +#ifdef illumos ASSERT(arg > KERNELBASE); #endif format = dtrace_format_add(state, @@ -10159,7 +11809,8 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, /*FALLTHROUGH*/ case DTRACEACT_LIBACT: - case DTRACEACT_DIFEXPR: + case DTRACEACT_TRACEMEM: + case DTRACEACT_TRACEMEM_DYNSIZE: if (dp == NULL) return (EINVAL); @@ -10258,7 +11909,7 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, break; case DTRACEACT_SPECULATE: - if (ecb->dte_size > sizeof (dtrace_epid_t)) + if (ecb->dte_size > sizeof (dtrace_rechdr_t)) return (EINVAL); if (dp == NULL) @@ -10271,10 +11922,6 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, size = dp->dtdo_rtype.dtdt_size; break; - case DTRACEACT_PRINTT: - size = dp->dtdo_rtype.dtdt_size; - break; - case DTRACEACT_COMMIT: { dtrace_action_t *act = ecb->dte_action; @@ -10379,7 +12026,7 @@ dtrace_ecb_action_remove(dtrace_ecb_t *e ecb->dte_action = NULL; ecb->dte_action_last = NULL; - ecb->dte_size = sizeof (dtrace_epid_t); + ecb->dte_size = 0; } static void @@ -10542,12 +12189,12 @@ dtrace_ecb_create(dtrace_state_t *state, * of creating our own (saving both time and space). */ dtrace_ecb_t *cached = dtrace_ecb_create_cache; - dtrace_action_t *xact = cached->dte_action; + dtrace_action_t *act = cached->dte_action; - if (xact != NULL) { - ASSERT(xact->dta_refcnt > 0); - xact->dta_refcnt++; - ecb->dte_action = xact; + if (act != NULL) { + ASSERT(act->dta_refcnt > 0); + act->dta_refcnt++; + ecb->dte_action = act; ecb->dte_action_last = cached->dte_action_last; ecb->dte_needed = cached->dte_needed; ecb->dte_size = cached->dte_size; @@ -10564,7 +12211,10 @@ dtrace_ecb_create(dtrace_state_t *state, } } - dtrace_ecb_resize(ecb); + if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) { + dtrace_ecb_destroy(ecb); + return (NULL); + } return (dtrace_ecb_create_cache = ecb); } @@ -10650,11 +12300,13 @@ dtrace_buffer_switch(dtrace_buffer_t *bu caddr_t tomax = buf->dtb_tomax; caddr_t xamot = buf->dtb_xamot; dtrace_icookie_t cookie; + hrtime_t now; ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH)); ASSERT(!(buf->dtb_flags & DTRACEBUF_RING)); cookie = dtrace_interrupt_disable(); + now = dtrace_gethrtime(); buf->dtb_tomax = xamot; buf->dtb_xamot = tomax; buf->dtb_xamot_drops = buf->dtb_drops; @@ -10665,6 +12317,8 @@ dtrace_buffer_switch(dtrace_buffer_t *bu buf->dtb_drops = 0; buf->dtb_errors = 0; buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED); + buf->dtb_interval = now - buf->dtb_switched; + buf->dtb_switched = now; dtrace_interrupt_enable(cookie); } @@ -10695,22 +12349,41 @@ dtrace_buffer_activate(dtrace_state_t *s dtrace_interrupt_enable(cookie); } +#ifdef __FreeBSD__ +/* + * Activate the specified per-CPU buffer. This is used instead of + * dtrace_buffer_activate() when APs have not yet started, i.e. when + * activating anonymous state. 
+ */ +static void +dtrace_buffer_activate_cpu(dtrace_state_t *state, int cpu) +{ + + if (state->dts_buffer[cpu].dtb_tomax != NULL) + state->dts_buffer[cpu].dtb_flags &= ~DTRACEBUF_INACTIVE; +} +#endif + static int dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, - processorid_t cpu) + processorid_t cpu, int *factor) { -#if defined(sun) +#ifdef illumos cpu_t *cp; -#else +#endif +#ifdef __NetBSD__ CPU_INFO_ITERATOR cpuind; struct cpu_info *cinfo; #endif dtrace_buffer_t *buf; + int allocated = 0, desired = 0; -#if defined(sun) +#ifdef illumos ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(MUTEX_HELD(&dtrace_lock)); + *factor = 1; + if (size > dtrace_nonroot_maxsize && !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE)) return (EFBIG); @@ -10734,7 +12407,8 @@ dtrace_buffer_alloc(dtrace_buffer_t *buf ASSERT(buf->dtb_xamot == NULL); - if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if ((buf->dtb_tomax = kmem_zalloc(size, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) goto err; buf->dtb_size = size; @@ -10745,7 +12419,8 @@ dtrace_buffer_alloc(dtrace_buffer_t *buf if (flags & DTRACEBUF_NOSWITCH) continue; - if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if ((buf->dtb_xamot = kmem_zalloc(size, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) goto err; } while ((cp = cp->cpu_next) != cpu_list); @@ -10759,42 +12434,53 @@ err: continue; buf = &bufs[cp->cpu_id]; + desired += 2; if (buf->dtb_xamot != NULL) { ASSERT(buf->dtb_tomax != NULL); ASSERT(buf->dtb_size == size); kmem_free(buf->dtb_xamot, size); + allocated++; } if (buf->dtb_tomax != NULL) { ASSERT(buf->dtb_size == size); kmem_free(buf->dtb_tomax, size); + allocated++; } buf->dtb_tomax = NULL; buf->dtb_xamot = NULL; buf->dtb_size = 0; } while ((cp = cp->cpu_next) != cpu_list); - - return (ENOMEM); #else -#if defined(__amd64__) + *factor = 1; +#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ + defined(__mips__) || defined(__powerpc__) || defined(__riscv__) /* * FreeBSD isn't good at limiting the amount of memory we * ask to malloc, so let's place a limit here before trying * to do something that might well end in tears at bedtime. */ if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1))) - return(ENOMEM); + return (ENOMEM); #endif ASSERT(MUTEX_HELD(&dtrace_lock)); - for (CPU_INFO_FOREACH(cpuind, cinfo)) { - if (cpu != DTRACE_CPUALL && cpu != cpu_index(cinfo)) +#ifdef __NetBSD__ + for (CPU_INFO_FOREACH(cpuind, cinfo)) +#else + CPU_FOREACH(i) +#endif + { +#ifdef __NetBSD__ + int i = cpu_index(cinfo); +#endif + if (cpu != DTRACE_CPUALL && cpu != i) continue; - buf = &bufs[cpu_index(cinfo)]; + buf = &bufs[i]; /* * If there is already a buffer allocated for this CPU, it @@ -10808,7 +12494,8 @@ err: ASSERT(buf->dtb_xamot == NULL); - if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if ((buf->dtb_tomax = kmem_zalloc(size, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) goto err; buf->dtb_size = size; @@ -10819,7 +12506,8 @@ err: if (flags & DTRACEBUF_NOSWITCH) continue; - if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if ((buf->dtb_xamot = kmem_zalloc(size, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) goto err; } @@ -10830,21 +12518,32 @@ err: * Error allocating memory, so free the buffers that were * allocated before the failed allocation. 
*/ - for (CPU_INFO_FOREACH(cpuind, cinfo)) { +#ifdef __NetBSD__ + for (CPU_INFO_FOREACH(cpuind, cinfo)) +#else + CPU_FOREACH(i) +#endif + { +#ifdef __NetBSD__ + int i = cpu_index(cinfo); +#endif if (cpu != DTRACE_CPUALL && cpu != cpu_index(cinfo)) continue; - buf = &bufs[cpu_index(cinfo)]; + buf = &bufs[i]; + desired += 2; if (buf->dtb_xamot != NULL) { ASSERT(buf->dtb_tomax != NULL); ASSERT(buf->dtb_size == size); kmem_free(buf->dtb_xamot, size); + allocated++; } if (buf->dtb_tomax != NULL) { ASSERT(buf->dtb_size == size); kmem_free(buf->dtb_tomax, size); + allocated++; } buf->dtb_tomax = NULL; @@ -10852,9 +12551,10 @@ err: buf->dtb_size = 0; } +#endif + *factor = desired / (allocated > 0 ? allocated : 1); return (ENOMEM); -#endif } /* @@ -11020,7 +12720,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *b if (epid == DTRACE_EPIDNONE) { size = sizeof (uint32_t); } else { - ASSERT(epid <= state->dts_necbs); + ASSERT3U(epid, <=, state->dts_necbs); ASSERT(state->dts_ecbs[epid - 1] != NULL); size = state->dts_ecbs[epid - 1]->dte_size; @@ -11148,11 +12848,41 @@ dtrace_buffer_polish(dtrace_buffer_t *bu buf->dtb_xamot_offset - buf->dtb_offset); } - if (buf->dtb_offset > buf->dtb_xamot_offset) { - bzero(buf->dtb_tomax + buf->dtb_offset, - buf->dtb_size - buf->dtb_offset); - bzero(buf->dtb_tomax, buf->dtb_xamot_offset); - } + if (buf->dtb_offset > buf->dtb_xamot_offset) { + bzero(buf->dtb_tomax + buf->dtb_offset, + buf->dtb_size - buf->dtb_offset); + bzero(buf->dtb_tomax, buf->dtb_xamot_offset); + } +} + +/* + * This routine determines if data generated at the specified time has likely + * been entirely consumed at user-level. This routine is called to determine + * if an ECB on a defunct probe (but for an active enabling) can be safely + * disabled and destroyed. + */ +static int +dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when) +{ + int i; + + for (i = 0; i < NCPU; i++) { + dtrace_buffer_t *buf = &bufs[i]; + + if (buf->dtb_size == 0) + continue; + + if (buf->dtb_flags & DTRACEBUF_RING) + return (0); + + if (!buf->dtb_switched && buf->dtb_offset != 0) + return (0); + + if (buf->dtb_switched - buf->dtb_interval < when) + return (0); + } + + return (1); } static void @@ -11269,9 +12999,15 @@ dtrace_enabling_dump(dtrace_enabling_t * for (i = 0; i < enab->dten_ndesc; i++) { dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe; +#ifdef __FreeBSD__ + printf("dtrace: enabling probe %d (%s:%s:%s:%s)\n", i, + desc->dtpd_provider, desc->dtpd_mod, + desc->dtpd_func, desc->dtpd_name); +#else cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i, desc->dtpd_provider, desc->dtpd_mod, desc->dtpd_func, desc->dtpd_name); +#endif } } @@ -11314,6 +13050,7 @@ dtrace_enabling_destroy(dtrace_enabling_ ASSERT(enab->dten_vstate->dtvs_state != NULL); ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0); enab->dten_vstate->dtvs_state->dts_nretained--; + dtrace_retained_gen++; } if (enab->dten_prev == NULL) { @@ -11356,6 +13093,7 @@ dtrace_enabling_retain(dtrace_enabling_t return (ENOSPC); state->dts_nretained++; + dtrace_retained_gen++; if (dtrace_retained == NULL) { dtrace_retained = enab; @@ -11540,10 +13278,11 @@ dtrace_enabling_matchall(void) * block pending our completion. 
*/ for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) { -#if defined(sun) +#ifdef illumos cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred; - if (INGLOBALZONE(curproc) || getzoneid() == crgetzoneid(cr)) + if (INGLOBALZONE(curproc) || + cr != NULL && getzoneid() == crgetzoneid(cr)) #endif (void) dtrace_enabling_match(enab, NULL); } @@ -11604,6 +13343,7 @@ dtrace_enabling_provide(dtrace_provider_ { int i, all = 0; dtrace_probedesc_t desc; + dtrace_genid_t gen; ASSERT(MUTEX_HELD(&dtrace_lock)); ASSERT(MUTEX_HELD(&dtrace_provider_lock)); @@ -11614,15 +13354,25 @@ dtrace_enabling_provide(dtrace_provider_ } do { - dtrace_enabling_t *enab = dtrace_retained; + dtrace_enabling_t *enab; void *parg = prv->dtpv_arg; - for (; enab != NULL; enab = enab->dten_next) { +retry: + gen = dtrace_retained_gen; + for (enab = dtrace_retained; enab != NULL; + enab = enab->dten_next) { for (i = 0; i < enab->dten_ndesc; i++) { desc = enab->dten_desc[i]->dted_probe; mutex_exit(&dtrace_lock); prv->dtpv_pops.dtps_provide(parg, &desc); mutex_enter(&dtrace_lock); + /* + * Process the retained enablings again if + * they have changed while we weren't holding + * dtrace_lock. + */ + if (gen != dtrace_retained_gen) + goto retry; } } } while (all && (prv = prv->dtpv_next) != NULL); @@ -11633,6 +13383,84 @@ dtrace_enabling_provide(dtrace_provider_ } /* + * Called to reap ECBs that are attached to probes from defunct providers. + */ +static void +dtrace_enabling_reap(void) +{ + dtrace_provider_t *prov; + dtrace_probe_t *probe; + dtrace_ecb_t *ecb; + hrtime_t when; + int i; + + mutex_enter(&cpu_lock); + mutex_enter(&dtrace_lock); + + for (i = 0; i < dtrace_nprobes; i++) { + if ((probe = dtrace_probes[i]) == NULL) + continue; + + if (probe->dtpr_ecb == NULL) + continue; + + prov = probe->dtpr_provider; + + if ((when = prov->dtpv_defunct) == 0) + continue; + + /* + * We have ECBs on a defunct provider: we want to reap these + * ECBs to allow the provider to unregister. The destruction + * of these ECBs must be done carefully: if we destroy the ECB + * and the consumer later wishes to consume an EPID that + * corresponds to the destroyed ECB (and if the EPID metadata + * has not been previously consumed), the consumer will abort + * processing on the unknown EPID. To reduce (but not, sadly, + * eliminate) the possibility of this, we will only destroy an + * ECB for a defunct provider if, for the state that + * corresponds to the ECB: + * + * (a) There is no speculative tracing (which can effectively + * cache an EPID for an arbitrary amount of time). + * + * (b) The principal buffers have been switched twice since the + * provider became defunct. + * + * (c) The aggregation buffers are of zero size or have been + * switched twice since the provider became defunct. + * + * We use dts_speculates to determine (a) and call a function + * (dtrace_buffer_consumed()) to determine (b) and (c). Note + * that as soon as we've been unable to destroy one of the ECBs + * associated with the probe, we quit trying -- reaping is only + * fruitful in as much as we can destroy all ECBs associated + * with the defunct provider's probes. 
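Editorial aside: the "switched twice" test used by the reaper relies on the dtb_switched/dtb_interval bookkeeping added to dtrace_buffer_switch() earlier in this diff. dtb_switched is the time of the most recent switch and dtb_interval is the gap back to the switch before it, so dtb_switched - dtb_interval is when the previous switch happened; data generated at time `when` can only have reached user level if that previous switch is not older than `when`. A small numeric sketch of just that per-buffer test (the ring-buffer and never-switched early-outs of dtrace_buffer_consumed() are omitted, and the timestamps are made up for illustration).

#include <stdio.h>
#include <stdint.h>

typedef int64_t hrtime_t;

/* Mirrors the switch-time test in dtrace_buffer_consumed(). */
static int
buffer_consumed(hrtime_t switched, hrtime_t interval, hrtime_t when)
{
	/* Previous switch time: start of the most recent interval. */
	return (switched - interval >= when);
}

int
main(void)
{
	/* Provider went defunct at t=100. */
	hrtime_t when = 100;

	/* Last switch at t=150, previous one at t=120: consumed. */
	printf("%d\n", buffer_consumed(150, 30, when));

	/* Last switch at t=150, previous one at t=80: not yet consumed. */
	printf("%d\n", buffer_consumed(150, 70, when));
	return 0;
}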
+ */ + while ((ecb = probe->dtpr_ecb) != NULL) { + dtrace_state_t *state = ecb->dte_state; + dtrace_buffer_t *buf = state->dts_buffer; + dtrace_buffer_t *aggbuf = state->dts_aggbuffer; + + if (state->dts_speculates) + break; + + if (!dtrace_buffer_consumed(buf, when)) + break; + + if (!dtrace_buffer_consumed(aggbuf, when)) + break; + + dtrace_ecb_disable(ecb); + ASSERT(probe->dtpr_ecb != ecb); + dtrace_ecb_destroy(ecb); + } + } + + mutex_exit(&dtrace_lock); + mutex_exit(&cpu_lock); +} +/* * DTrace DOF Functions */ /*ARGSUSED*/ @@ -11754,10 +13582,107 @@ dtrace_dof_copyin(uintptr_t uarg, int *e return (dof); } -#if 0 -#if !defined(sun) +#ifdef __FreeBSD__ +static dof_hdr_t * +dtrace_dof_copyin_proc(struct proc *p, uintptr_t uarg, int *errp) +{ + dof_hdr_t hdr, *dof; + struct thread *td; + size_t loadsz; + + ASSERT(!MUTEX_HELD(&dtrace_lock)); + + td = curthread; + + /* + * First, we're going to copyin() the sizeof (dof_hdr_t). + */ + if (proc_readmem(td, p, uarg, &hdr, sizeof(hdr)) != sizeof(hdr)) { + dtrace_dof_error(NULL, "failed to copyin DOF header"); + *errp = EFAULT; + return (NULL); + } + + /* + * Now we'll allocate the entire DOF and copy it in -- provided + * that the length isn't outrageous. + */ + if (hdr.dofh_loadsz >= dtrace_dof_maxsize) { + dtrace_dof_error(&hdr, "load size exceeds maximum"); + *errp = E2BIG; + return (NULL); + } + loadsz = (size_t)hdr.dofh_loadsz; + + if (loadsz < sizeof (hdr)) { + dtrace_dof_error(&hdr, "invalid load size"); + *errp = EINVAL; + return (NULL); + } + + dof = kmem_alloc(loadsz, KM_SLEEP); + + if (proc_readmem(td, p, uarg, dof, loadsz) != loadsz || + dof->dofh_loadsz != loadsz) { + kmem_free(dof, hdr.dofh_loadsz); + *errp = EFAULT; + return (NULL); + } + + return (dof); +} +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ +static dof_hdr_t * +dtrace_dof_copyin_pid(pid_t pid, const void *uarg, int *errp) +{ + dof_hdr_t hdr, *dof; + size_t loadsz; + int err; + + err = copyin_pid(pid, uarg, &hdr, sizeof(hdr)); + if (err != 0) { + *errp = err; + return (NULL); + } + + /* + * Now we'll allocate the entire DOF and copy it in -- provided + * that the length isn't outrageous. + */ + if (hdr.dofh_loadsz >= dtrace_dof_maxsize) { + dtrace_dof_error(&hdr, "load size exceeds maximum"); + *errp = E2BIG; + return (NULL); + } + loadsz = (size_t)hdr.dofh_loadsz; + + if (loadsz < sizeof (hdr)) { + dtrace_dof_error(&hdr, "invalid load size"); + *errp = EINVAL; + return (NULL); + } + + dof = kmem_alloc(loadsz, KM_SLEEP); + + err = copyin_pid(pid, uarg, dof, loadsz); + if (err == 0 && dof->dofh_loadsz != loadsz) + err = EFAULT; + if (err != 0) { + kmem_free(dof, loadsz); + *errp = EFAULT; + return (NULL); + } + + return (dof); +} +#endif + +#ifdef __FreeBSD__ static __inline uchar_t -dtrace_dof_char(char c) { +dtrace_dof_char(char c) +{ + switch (c) { case '0': case '1': @@ -11786,19 +13711,18 @@ dtrace_dof_char(char c) { return (c - 'a' + 10); } /* Should not reach here. 
*/ - return (0); + return (UCHAR_MAX); } -#endif -#endif +#endif /* __FreeBSD__ */ static dof_hdr_t * dtrace_dof_property(const char *name) { - dof_hdr_t *dof = NULL; -#if defined(sun) +#ifdef illumos uchar_t *buf; uint64_t loadsz; unsigned int len, i; + dof_hdr_t *dof = NULL; /* * Unfortunately, array of values in .conf files are always (and @@ -11833,52 +13757,89 @@ dtrace_dof_property(const char *name) dof = kmem_alloc(loadsz, KM_SLEEP); bcopy(buf, dof, loadsz); ddi_prop_free(buf); -#else - printf("dtrace: XXX %s not implemented (name=%s)\n", __func__, name); -#if 0 /* XXX TBD dtrace_dof_provide */ - char *p; - char *p_env; - if ((p_env = getenv(name)) == NULL) - return (NULL); + return (dof); +#endif /* illumos */ +#ifdef __FreeBSD__ + uint8_t *dofbuf; + u_char *data, *eol; + caddr_t doffile; + size_t bytes, len, i; + dof_hdr_t *dof; + u_char c1, c2; - len = strlen(p_env) / 2; + dof = NULL; - buf = kmem_alloc(len, KM_SLEEP); + doffile = preload_search_by_type("dtrace_dof"); + if (doffile == NULL) + return (NULL); - dof = (dof_hdr_t *) buf; + data = preload_fetch_addr(doffile); + len = preload_fetch_size(doffile); + for (;;) { + /* Look for the end of the line. All lines end in a newline. */ + eol = memchr(data, '\n', len); + if (eol == NULL) + return (NULL); - p = p_env; + if (strncmp(name, data, strlen(name)) == 0) + break; - for (i = 0; i < len; i++) { - buf[i] = (dtrace_dof_char(p[0]) << 4) | - dtrace_dof_char(p[1]); - p += 2; + eol++; /* skip past the newline */ + len -= eol - data; + data = eol; } - freeenv(p_env); + /* We've found the data corresponding to the specified key. */ - if (len < sizeof (dof_hdr_t)) { - kmem_free(buf, len); + data += strlen(name) + 1; /* skip past the '=' */ + len = eol - data; + if (len % 2 != 0) { + dtrace_dof_error(NULL, "invalid DOF encoding length"); + goto doferr; + } + bytes = len / 2; + if (bytes < sizeof(dof_hdr_t)) { dtrace_dof_error(NULL, "truncated header"); - return (NULL); + goto doferr; + } + + /* + * Each byte is represented by the two ASCII characters in its hex + * representation. 
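Editorial aside: the FreeBSD path above recovers the DOF from a preloaded file in which every byte is spelled as two hex characters, and dtrace_dof_char() now returns UCHAR_MAX so a bad character can be told apart from a legitimate 0xf nibble. A stand-alone sketch of the same decode step; the function names below are local to the example, and it also accepts uppercase digits, which the kernel routine does not need to.

#include <stdio.h>
#include <stddef.h>
#include <limits.h>

/* Map a hex character to its value, or UCHAR_MAX for anything else. */
static unsigned char
hexval(char c)
{
	if (c >= '0' && c <= '9')
		return c - '0';
	if (c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if (c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return UCHAR_MAX;
}

/* Decode len/2 bytes from 'hex' into 'out'; returns 0 on success. */
static int
decode_hex(const char *hex, size_t len, unsigned char *out)
{
	size_t i;

	if (len % 2 != 0)
		return -1;
	for (i = 0; i < len / 2; i++) {
		unsigned char c1 = hexval(hex[i * 2]);
		unsigned char c2 = hexval(hex[i * 2 + 1]);

		if (c1 == UCHAR_MAX || c2 == UCHAR_MAX)
			return -1;
		out[i] = c1 * 16 + c2;
	}
	return 0;
}

int
main(void)
{
	unsigned char buf[2];

	if (decode_hex("7f45", 4, buf) == 0)
		printf("%02x %02x\n", buf[0], buf[1]);	/* 7f 45 */
	return 0;
}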
+ */ + dofbuf = malloc(bytes, M_SOLARIS, M_WAITOK); + for (i = 0; i < bytes; i++) { + c1 = dtrace_dof_char(data[i * 2]); + c2 = dtrace_dof_char(data[i * 2 + 1]); + if (c1 == UCHAR_MAX || c2 == UCHAR_MAX) { + dtrace_dof_error(NULL, "invalid hex char in DOF"); + goto doferr; + } + dofbuf[i] = c1 * 16 + c2; } - if (len < (loadsz = dof->dofh_loadsz)) { - kmem_free(buf, len); + dof = (dof_hdr_t *)dofbuf; + if (bytes < dof->dofh_loadsz) { dtrace_dof_error(NULL, "truncated DOF"); - return (NULL); + goto doferr; } - if (loadsz >= dtrace_dof_maxsize) { - kmem_free(buf, len); + if (dof->dofh_loadsz >= dtrace_dof_maxsize) { dtrace_dof_error(NULL, "oversized DOF"); - return (NULL); + goto doferr; } -#endif -#endif return (dof); + +doferr: + free(dof, M_SOLARIS); + return (NULL); +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ + printf("dtrace: XXX %s not implemented (name=%s)\n", __func__, name); + return (NULL); +#endif /* __NetBSD__ */ } static void @@ -11994,7 +13955,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_ size_t ttl = 0; dof_difohdr_t *dofd; uintptr_t daddr = (uintptr_t)dof; - size_t maxx = dtrace_difo_maxsize; + size_t max = dtrace_difo_maxsize; int i, l, n; static const struct { @@ -12055,7 +14016,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_ dofd->dofd_links[l])) == NULL) goto err; /* invalid section link */ - if (ttl + subsec->dofs_size > maxx) { + if (ttl + subsec->dofs_size > max) { dtrace_dof_error(dof, "exceeds maximum size"); goto err; } @@ -12219,15 +14180,20 @@ dtrace_dof_actdesc(dof_hdr_t *dof, dof_s (uintptr_t)sec->dofs_offset + offs); kind = (dtrace_actkind_t)desc->dofa_kind; - if (DTRACEACT_ISPRINTFLIKE(kind) && + if ((DTRACEACT_ISPRINTFLIKE(kind) && (kind != DTRACEACT_PRINTA || + desc->dofa_strtab != DOF_SECIDX_NONE)) || + (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE)) { dof_sec_t *strtab; char *str, *fmt; uint64_t i; /* - * printf()-like actions must have a format string. + * The argument to these actions is an index into the + * DOF string table. For printf()-like actions, this + * is the format string. For print(), this is the + * CTF type of the expression result. */ if ((strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL) @@ -12365,12 +14331,13 @@ err: /* * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the - * specified DOF. At present, this amounts to simply adding 'ubase' to the - * site of any user SETX relocations to account for load object base address. - * In the future, if we need other relocations, this function can be extended. + * specified DOF. SETX relocations are computed using 'ubase', the base load + * address of the object containing the DOF, and DOFREL relocations are relative + * to the relocation offset within the DOF. 
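Editorial aside, to make the SETX/DOFREL distinction above concrete: a SETX slot has the load address of the containing object added to it, while a DOFREL slot has the address of the DOF image plus the slot's own position added, so it ends up pointing back into the DOF regardless of where that image was mapped. A minimal model of applying one relocation of each kind; the layout is simplified (the section offset and relocation offset of the real dof_relodesc_t are collapsed into a single 'offset'), and all names and addresses are invented for the example.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define RELO_SETX   1
#define RELO_DOFREL 2

/* Apply one relocation to the 64-bit slot at 'offset' inside the image. */
static void
apply_relo(unsigned char *image, uint64_t offset, int type,
    uint64_t ubase, uint64_t udaddr)
{
	uint64_t slot;

	memcpy(&slot, image + offset, sizeof(slot));
	if (type == RELO_SETX)
		slot += ubase;		/* base address of the load object */
	else if (type == RELO_DOFREL)
		slot += udaddr + offset;	/* address of this slot in the DOF */
	memcpy(image + offset, &slot, sizeof(slot));
}

int
main(void)
{
	unsigned char image[32] = { 0 };
	uint64_t v;

	apply_relo(image, 8, RELO_SETX, 0x400000, 0x7f0000);
	apply_relo(image, 16, RELO_DOFREL, 0x400000, 0x7f0000);

	memcpy(&v, image + 8, sizeof(v));
	printf("SETX   slot -> 0x%llx\n", (unsigned long long)v);  /* 0x400000 */
	memcpy(&v, image + 16, sizeof(v));
	printf("DOFREL slot -> 0x%llx\n", (unsigned long long)v);  /* 0x7f0010 */
	return 0;
}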
*/ static int -dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase) +dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase, + uint64_t udaddr) { uintptr_t daddr = (uintptr_t)dof; dof_relohdr_t *dofr = @@ -12408,6 +14375,7 @@ dtrace_dof_relocate(dof_hdr_t *dof, dof_ case DOF_RELO_NONE: break; case DOF_RELO_SETX: + case DOF_RELO_DOFREL: if (r->dofr_offset >= ts->dofs_size || r->dofr_offset + sizeof (uint64_t) > ts->dofs_size) { dtrace_dof_error(dof, "bad relocation offset"); @@ -12419,7 +14387,11 @@ dtrace_dof_relocate(dof_hdr_t *dof, dof_ return (-1); } - *(uint64_t *)taddr += ubase; + if (r->dofr_type == DOF_RELO_SETX) + *(uint64_t *)taddr += ubase; + else + *(uint64_t *)taddr += + udaddr + ts->dofs_offset + r->dofr_offset; break; default: dtrace_dof_error(dof, "invalid relocation type"); @@ -12440,7 +14412,7 @@ dtrace_dof_relocate(dof_hdr_t *dof, dof_ */ static int dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr, - dtrace_enabling_t **enabp, uint64_t ubase, int noprobes) + dtrace_enabling_t **enabp, uint64_t ubase, uint64_t udaddr, int noprobes) { uint64_t len = dof->dofh_loadsz, seclen; uintptr_t daddr = (uintptr_t)dof; @@ -12565,7 +14537,7 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_ if (!(sec->dofs_flags & DOF_SECF_LOAD)) continue; /* just ignore non-loadable sections */ - if (sec->dofs_align & (sec->dofs_align - 1)) { + if (!ISP2(sec->dofs_align)) { dtrace_dof_error(dof, "bad section alignment"); return (-1); } @@ -12602,7 +14574,7 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_ switch (sec->dofs_type) { case DOF_SECT_URELHDR: - if (dtrace_dof_relocate(dof, sec, ubase) != 0) + if (dtrace_dof_relocate(dof, sec, ubase, udaddr) != 0) return (-1); break; } @@ -12696,7 +14668,7 @@ dtrace_dof_options(dof_hdr_t *dof, dtrac static int dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) { - size_t hashsize, maxper, minn, chunksize = dstate->dtds_chunksize; + size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize; void *base; uintptr_t limit; dtrace_dynvar_t *dvar, *next, *start; @@ -12710,10 +14682,12 @@ dtrace_dstate_init(dtrace_dstate_t *dsta if ((dstate->dtds_chunksize = chunksize) == 0) dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE; - if (size < (minn = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t))) - size = minn; + VERIFY(dstate->dtds_chunksize < LONG_MAX); - if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL) + if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t))) + size = min; + + if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL) return (ENOMEM); dstate->dtds_size = size; @@ -12750,10 +14724,22 @@ dtrace_dstate_init(dtrace_dstate_t *dsta ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t)); limit = (uintptr_t)base + size; + VERIFY((uintptr_t)start < limit); + VERIFY((uintptr_t)start >= (uintptr_t)base); + maxper = (limit - (uintptr_t)start) / NCPU; maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize; - for (i = 0; i < NCPU; i++) { +#ifdef illumos + for (i = 0; i < NCPU; i++) +#endif +#ifdef __FreeBSD__ + CPU_FOREACH(i) +#endif +#ifdef __NetBSD__ + for (i = 0; i < NCPU; i++) +#endif + { dstate->dtds_percpu[i].dtdsc_free = dvar = start; /* @@ -12771,7 +14757,7 @@ dtrace_dstate_init(dtrace_dstate_t *dsta start = (dtrace_dynvar_t *)limit; } - ASSERT(limit <= (uintptr_t)base + size); + VERIFY(limit <= (uintptr_t)base + size); for (;;) { next = (dtrace_dynvar_t *)((uintptr_t)dvar + @@ -12780,6 +14766,8 @@ dtrace_dstate_init(dtrace_dstate_t *dsta if ((uintptr_t)next + 
dstate->dtds_chunksize >= limit) break; + VERIFY((uintptr_t)dvar >= (uintptr_t)base && + (uintptr_t)dvar <= (uintptr_t)base + size); dvar->dtdv_next = next; dvar = next; } @@ -12829,6 +14817,56 @@ dtrace_vstate_fini(dtrace_vstate_t *vsta } } +#ifdef __FreeBSD__ +static void +dtrace_state_clean(void *arg) +{ + dtrace_state_t *state = arg; + dtrace_optval_t *opt = state->dts_options; + + if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) + return; + + dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars); + dtrace_speculation_clean(state); + + callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC, + dtrace_state_clean, state); +} + +static void +dtrace_state_deadman(void *arg) +{ + dtrace_state_t *state = arg; + hrtime_t now; + + dtrace_sync(); + + dtrace_debug_output(); + + now = dtrace_gethrtime(); + + if (state != dtrace_anon.dta_state && + now - state->dts_laststatus >= dtrace_deadman_user) + return; + + /* + * We must be sure that dts_alive never appears to be less than the + * value upon entry to dtrace_state_deadman(), and because we lack a + * dtrace_cas64(), we cannot store to it atomically. We thus instead + * store INT64_MAX to it, followed by a memory barrier, followed by + * the new value. This assures that dts_alive never appears to be + * less than its true value, regardless of the order in which the + * stores to the underlying storage are issued. + */ + state->dts_alive = INT64_MAX; + dtrace_membar_producer(); + state->dts_alive = now; + + callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC, + dtrace_state_deadman, state); +} +#else static void dtrace_state_clean(dtrace_state_t *state) { @@ -12866,25 +14904,29 @@ dtrace_state_deadman(dtrace_state_t *sta state->dts_alive = now; } -#if !defined(sun) -struct dtrace_state_worker *dtrace_state_worker_add(void (*)(dtrace_state_t *), - dtrace_state_t *, hrtime_t); -void dtrace_state_worker_remove(struct dtrace_state_worker *); -#endif +#endif /* illumos */ static dtrace_state_t * -#if defined(sun) +#ifdef illumos +dtrace_state_create(dev_t *devp, cred_t *cr) +#endif +#ifdef __FreeBSD__ +dtrace_state_create(struct cdev *dev, struct ucred *cred __unused) +#endif +#ifdef __NetBSD__ dtrace_state_create(dev_t *devp, cred_t *cr) -#else -dtrace_state_create(dev_t dev, cred_t *cr) #endif { -#if defined(sun) +#ifdef illumos minor_t minor; major_t major; #else int m = 0; #endif +#ifdef __FreeBSD__ + cred_t *cr = NULL; +#endif + int cpu_it; char c[30]; dtrace_state_t *state; dtrace_optval_t *opt; @@ -12893,7 +14935,7 @@ dtrace_state_create(dev_t dev, cred_t *c ASSERT(MUTEX_HELD(&dtrace_lock)); ASSERT(MUTEX_HELD(&cpu_lock)); -#if defined(sun) +#ifdef illumos minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1, VM_BESTFIT | VM_SLEEP); @@ -12903,17 +14945,25 @@ dtrace_state_create(dev_t dev, cred_t *c } state = ddi_get_soft_state(dtrace_softstate, minor); -#else - m = minor(dev) & 0x0F; +#endif +#ifdef __FreeBSD__ + if (dev != NULL) { + cr = dev->si_cred; + m = dev2unit(dev); + } +#endif +#ifdef __NetBSD__ + m = minor(*devp) & 0x0F; /* Allocate memory for the state. 
*/ state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP); #endif + state->dts_epid = DTRACE_EPIDNONE + 1; (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m); -#if defined(sun) +#ifdef illumos state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1, NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); @@ -12927,10 +14977,15 @@ dtrace_state_create(dev_t dev, cred_t *c if (devp != NULL) *devp = state->dts_dev; -#else +#endif +#ifdef __FreeBSD__ + state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx); + state->dts_dev = dev; +#endif +#ifdef __NetBSD__ state->dts_aggid_arena = vmem_create(c, 1, INT_MAX, 1, NULL, NULL, NULL, 0, VM_SLEEP, IPL_NONE); - state->dts_dev = dev; + state->dts_dev = *devp; #endif /* @@ -12942,10 +14997,31 @@ dtrace_state_create(dev_t dev, cred_t *c state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP); state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP); -#if defined(sun) + /* + * Allocate and initialise the per-process per-CPU random state. + * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is + * assumed to be seeded at this point (if from Fortuna seed file). + */ + (void) read_random(&state->dts_rstate[0], 2 * sizeof(uint64_t)); + for (cpu_it = 1; cpu_it < NCPU; cpu_it++) { + /* + * Each CPU is assigned a 2^64 period, non-overlapping + * subsequence. + */ + dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1], + state->dts_rstate[cpu_it]); + } + + +#ifdef illumos state->dts_cleaner = CYCLIC_NONE; state->dts_deadman = CYCLIC_NONE; -#else +#endif +#ifdef __FreeBSD__ + callout_init(&state->dts_cleaner, 1); + callout_init(&state->dts_deadman, 1); +#endif +#ifdef __NetBSD__ state->dts_cleaner = NULL; state->dts_deadman = NULL; #endif @@ -12992,11 +15068,7 @@ dtrace_state_create(dev_t dev, cred_t *c * credential from disappearing. This means that we can * examine the credential and the zone from probe context. */ -#if defined(sun) crhold(cr); -#else - kauth_cred_hold(cr); -#endif state->dts_cred.dcr_cred = cr; /* @@ -13036,7 +15108,7 @@ dtrace_state_create(dev_t dev, cred_t *c * we can do destructive things to processes which * have altered credentials. */ -#if defined(sun) +#ifdef illumos if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE), cr->cr_zone->zone_privset)) { state->dts_cred.dcr_action |= @@ -13078,7 +15150,7 @@ dtrace_state_create(dev_t dev, cred_t *c state->dts_cred.dcr_action |= DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE; -#if defined(sun) +#ifdef illumos /* * If we have all privs in whatever zone this is, * we can do destructive things to processes which @@ -13117,7 +15189,7 @@ dtrace_state_buffer(dtrace_state_t *stat { dtrace_optval_t *opt = state->dts_options, size; processorid_t cpu = 0;; - int flags = 0, rval; + int flags = 0, rval, factor, divisor = 1; ASSERT(MUTEX_HELD(&dtrace_lock)); ASSERT(MUTEX_HELD(&cpu_lock)); @@ -13147,7 +15219,7 @@ dtrace_state_buffer(dtrace_state_t *stat flags |= DTRACEBUF_INACTIVE; } - for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) { + for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) { /* * The size must be 8-byte aligned. If the size is not 8-byte * aligned, drop it down by the difference. 
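[Reviewer note, not part of the patch: the dtrace_state_buffer() hunks above and below change the allocation loop so that, when dtrace_buffer_alloc() fails, the request is retried with a progressively larger divisor derived from the 'factor' hint the allocator now returns. The following is only an illustrative user-space sketch of that backoff idea; try_alloc(), MIN_SIZE and alloc_with_backoff() are hypothetical stand-ins for the kernel interfaces.]

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define MIN_SIZE	sizeof(uint64_t)	/* smallest useful buffer */

    /*
     * Hypothetical allocator: on failure it suggests, via 'factor', how
     * much smaller the next request needs to be to have a chance.
     */
    static void *
    try_alloc(size_t size, int *factor)
    {
            void *p = malloc(size);

            if (p == NULL)
                    *factor = 4;	/* e.g. "try a quarter of that" */
            return (p);
    }

    /* Shrink the request until an allocation succeeds or it gets too small. */
    static void *
    alloc_with_backoff(size_t size)
    {
            int divisor = 1, factor = 2;
            void *p;

            for (; size >= MIN_SIZE; size /= divisor) {
                    /* Keep the size 8-byte aligned, as the buffer code requires. */
                    size -= size & (sizeof(uint64_t) - 1);
                    if ((p = try_alloc(size, &factor)) != NULL)
                            return (p);
                    /* Grow the divisor until it is at least the hinted factor. */
                    for (divisor = 2; divisor < factor; divisor <<= 1)
                            continue;
            }
            return (NULL);
    }

[End of note; the next hunk continues dtrace_state_buffer().]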
@@ -13165,7 +15237,7 @@ dtrace_state_buffer(dtrace_state_t *stat return (E2BIG); } - rval = dtrace_buffer_alloc(buf, size, flags, cpu); + rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor); if (rval != ENOMEM) { opt[which] = size; @@ -13174,6 +15246,9 @@ dtrace_state_buffer(dtrace_state_t *stat if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL) return (rval); + + for (divisor = 2; divisor < factor; divisor <<= 1) + continue; } return (ENOMEM); @@ -13234,7 +15309,7 @@ dtrace_state_go(dtrace_state_t *state, p dtrace_optval_t *opt = state->dts_options, sz, nspec; dtrace_speculation_t *spec; dtrace_buffer_t *buf; -#if defined(sun) +#ifdef illumos cyc_handler_t hdlr; cyc_time_t when; #endif @@ -13275,7 +15350,8 @@ dtrace_state_go(dtrace_state_t *state, p goto out; } - spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP); + spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), + KM_NOSLEEP | KM_NORMALPRI); if (spec == NULL) { rval = ENOMEM; @@ -13286,7 +15362,8 @@ dtrace_state_go(dtrace_state_t *state, p state->dts_nspeculations = (int)nspec; for (i = 0; i < nspec; i++) { - if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) { + if ((buf = kmem_zalloc(bufsize, + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { rval = ENOMEM; goto err; } @@ -13416,7 +15493,7 @@ dtrace_state_go(dtrace_state_t *state, p opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max; state->dts_alive = state->dts_laststatus = dtrace_gethrtime(); -#if defined(sun) +#ifdef illumos hdlr.cyh_func = (cyc_func_t)dtrace_state_clean; hdlr.cyh_arg = state; hdlr.cyh_level = CY_LOW_LEVEL; @@ -13434,7 +15511,14 @@ dtrace_state_go(dtrace_state_t *state, p when.cyt_interval = dtrace_deadman_interval; state->dts_deadman = cyclic_add(&hdlr, &when); -#else +#endif +#ifdef __FreeBSD__ + callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC, + dtrace_state_clean, state); + callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC, + dtrace_state_deadman, state); +#endif +#ifdef __NetBSD__ state->dts_cleaner = dtrace_state_worker_add( dtrace_state_clean, state, opt[DTRACEOPT_CLEANRATE]); state->dts_deadman = dtrace_state_worker_add( @@ -13443,6 +15527,24 @@ dtrace_state_go(dtrace_state_t *state, p state->dts_activity = DTRACE_ACTIVITY_WARMUP; +#ifdef illumos + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to bump our zone's count, and (if + * this is the first enabling to have an unprivileged call + * to getf()) we need to hook into closef(). + */ + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++; + + if (dtrace_getf++ == 0) { + ASSERT(dtrace_closef == NULL); + dtrace_closef = dtrace_getf_barrier; + } + } +#endif + /* * Now it's time to actually fire the BEGIN probe. We need to disable * interrupts here both to record the CPU on which we fired the BEGIN @@ -13467,6 +15569,19 @@ dtrace_state_go(dtrace_state_t *state, p if (state->dts_activity == DTRACE_ACTIVITY_WARMUP) state->dts_activity = DTRACE_ACTIVITY_ACTIVE; +#ifdef __FreeBSD__ + /* + * We enable anonymous tracing before APs are started, so we must + * activate buffers using the current CPU. 
+ */ + if (state == dtrace_anon.dta_state) + for (int i = 0; i < NCPU; i++) + dtrace_buffer_activate_cpu(state, i); + else + dtrace_xcall(DTRACE_CPUALL, + (dtrace_xcall_t)dtrace_buffer_activate, state); +#else + /* * Regardless of whether or not now we're in ACTIVE or DRAINING, we * want each CPU to transition its principal buffer out of the @@ -13477,6 +15592,7 @@ dtrace_state_go(dtrace_state_t *state, p */ dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_buffer_activate, state); +#endif goto out; err: @@ -13559,6 +15675,26 @@ dtrace_state_stop(dtrace_state_t *state, state->dts_activity = DTRACE_ACTIVITY_STOPPED; dtrace_sync(); +#ifdef illumos + if (state->dts_getf != 0 && + !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) { + /* + * We don't have kernel privs but we have at least one call + * to getf(); we need to lower our zone's count, and (if + * this is the last enabling to have an unprivileged call + * to getf()) we need to clear the closef() hook. + */ + ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0); + ASSERT(dtrace_closef == dtrace_getf_barrier); + ASSERT(dtrace_getf > 0); + + state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--; + + if (--dtrace_getf == 0) + dtrace_closef = NULL; + } +#endif + return (0); } @@ -13622,7 +15758,7 @@ dtrace_state_destroy(dtrace_state_t *sta { dtrace_ecb_t *ecb; dtrace_vstate_t *vstate = &state->dts_vstate; -#if defined(sun) +#ifdef illumos minor_t minor = getminor(state->dts_dev); #endif int i, bufsize = NCPU * sizeof (dtrace_buffer_t); @@ -13657,13 +15793,8 @@ dtrace_state_destroy(dtrace_state_t *sta /* * Release the credential hold we took in dtrace_state_create(). */ - if (state->dts_cred.dcr_cred != NULL) { -#if defined(sun) + if (state->dts_cred.dcr_cred != NULL) crfree(state->dts_cred.dcr_cred); -#else - kauth_cred_free(state->dts_cred.dcr_cred); -#endif - } /* * Now we can safely disable and destroy any enabled probes. Because @@ -13705,13 +15836,20 @@ dtrace_state_destroy(dtrace_state_t *sta for (i = 0; i < nspec; i++) dtrace_buffer_free(spec[i].dtsp_buffer); -#if defined(sun) +#ifdef illumos if (state->dts_cleaner != CYCLIC_NONE) cyclic_remove(state->dts_cleaner); if (state->dts_deadman != CYCLIC_NONE) cyclic_remove(state->dts_deadman); -#else +#endif +#ifdef __FreeBSD__ + callout_stop(&state->dts_cleaner); + callout_drain(&state->dts_cleaner); + callout_stop(&state->dts_deadman); + callout_drain(&state->dts_deadman); +#endif +#ifdef __NetBSD__ if (state->dts_cleaner != NULL) dtrace_state_worker_remove(state->dts_cleaner); @@ -13746,13 +15884,18 @@ dtrace_state_destroy(dtrace_state_t *sta dtrace_format_destroy(state); if (state->dts_aggid_arena != NULL) { +#if defined(illumos) || defined(__NetBSD__) vmem_destroy(state->dts_aggid_arena); +#else + delete_unrhdr(state->dts_aggid_arena); +#endif state->dts_aggid_arena = NULL; } -#if defined(sun) +#ifdef illumos ddi_soft_state_free(dtrace_softstate, minor); vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1); -#else +#endif +#ifdef __NetBSD__ kmem_free(state, sizeof(dtrace_state_t)); #endif } @@ -13803,7 +15946,7 @@ dtrace_anon_property(void) break; } -#if defined(sun) +#ifdef illumos /* * We want to create anonymous state, so we need to transition * the kernel debugger to indicate that DTrace is active. If @@ -13822,9 +15965,7 @@ dtrace_anon_property(void) * If we haven't allocated an anonymous state, we'll do so now. 
*/ if ((state = dtrace_anon.dta_state) == NULL) { -#if defined(sun) state = dtrace_state_create(NULL, NULL); -#endif dtrace_anon.dta_state = state; if (state == NULL) { @@ -13846,7 +15987,7 @@ dtrace_anon_property(void) } rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(), - &dtrace_anon.dta_enabling, 0, B_TRUE); + &dtrace_anon.dta_enabling, 0, 0, B_TRUE); if (rv == 0) rv = dtrace_dof_options(dof, state); @@ -13893,10 +16034,10 @@ dtrace_helper_trace(dtrace_helper_action dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where) { uint32_t size, next, nnext, i; - dtrace_helptrace_t *ent; + dtrace_helptrace_t *ent, *buffer; uint16_t flags = cpu_core[curcpu_id].cpuc_dtrace_flags; - if (!dtrace_helptrace_enabled) + if ((buffer = dtrace_helptrace_buffer) == NULL) return; ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals); @@ -13924,10 +16065,12 @@ dtrace_helper_trace(dtrace_helper_action /* * We have our slot; fill it in. */ - if (nnext == size) + if (nnext == size) { + dtrace_helptrace_wrapped++; next = 0; + } - ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next]; + ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next); ent->dtht_helper = helper; ent->dtht_where = where; ent->dtht_nlocals = vstate->dtvs_nlocals; @@ -13961,7 +16104,7 @@ dtrace_helper(int which, dtrace_mstate_t dtrace_helper_action_t *helper; dtrace_vstate_t *vstate; dtrace_difo_t *pred; - int i, trace = dtrace_helptrace_enabled; + int i, trace = dtrace_helptrace_buffer != NULL; ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS); @@ -14040,7 +16183,6 @@ err: return (0); } -#if defined(sun) static void dtrace_helper_action_destroy(dtrace_helper_action_t *helper, dtrace_vstate_t *vstate) @@ -14061,13 +16203,15 @@ dtrace_helper_action_destroy(dtrace_help } static int -dtrace_helper_destroygen(int gen) +dtrace_helper_destroygen(dtrace_helpers_t *help, int gen) { proc_t *p = curproc; - dtrace_helpers_t *help = p->p_dtrace_helpers; dtrace_vstate_t *vstate; int i; + if (help == NULL) + help = p->p_dtrace_helpers; + ASSERT(MUTEX_HELD(&dtrace_lock)); if (help == NULL || gen > help->dthps_generation) @@ -14165,9 +16309,9 @@ dtrace_helper_validate(dtrace_helper_act } static int -dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep) +dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep, + dtrace_helpers_t *help) { - dtrace_helpers_t *help; dtrace_helper_action_t *helper, *last; dtrace_actdesc_t *act; dtrace_vstate_t *vstate; @@ -14177,7 +16321,6 @@ dtrace_helper_action_add(int which, dtra if (which < 0 || which >= DTRACE_NHELPER_ACTIONS) return (EINVAL); - help = curproc->p_dtrace_helpers; last = help->dthps_actions[which]; vstate = &help->dthps_vstate; @@ -14301,15 +16444,12 @@ dtrace_helper_provider_register(proc_t * } static int -dtrace_helper_provider_add(dof_helper_t *dofhp, int gen) +dtrace_helper_provider_add(dof_helper_t *dofhp, dtrace_helpers_t *help, int gen) { - dtrace_helpers_t *help; dtrace_helper_provider_t *hprov, **tmp_provs; uint_t tmp_maxprovs, i; ASSERT(MUTEX_HELD(&dtrace_lock)); - - help = curproc->p_dtrace_helpers; ASSERT(help != NULL); /* @@ -14484,7 +16624,13 @@ dtrace_helper_provider_validate(dof_hdr_ if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) { dtrace_dof_error(dof, "function name too long"); - return (-1); + /* + * Keep going if the function name is too long. + * Unlike provider and probe names, we cannot reasonably + * impose restrictions on function names, since they're + * a property of the code being instrumented. 
We will + * skip this probe in dtrace_helper_provide_one(). + */ } if (probe->dofpr_name >= str_sec->dofs_size || @@ -14596,7 +16742,7 @@ dtrace_helper_provider_validate(dof_hdr_ } static int -dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp) +dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp, struct proc *p) { dtrace_helpers_t *help; dtrace_vstate_t *vstate; @@ -14606,13 +16752,12 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_ ASSERT(MUTEX_HELD(&dtrace_lock)); - if ((help = curproc->p_dtrace_helpers) == NULL) - help = dtrace_helpers_create(curproc); + if ((help = p->p_dtrace_helpers) == NULL) + help = dtrace_helpers_create(p); vstate = &help->dthps_vstate; - - if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, - dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) { + if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, dhp->dofhp_addr, + dhp->dofhp_dof, B_FALSE)) != 0) { dtrace_dof_destroy(dof); return (rv); } @@ -14620,22 +16765,20 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_ /* * Look for helper providers and validate their descriptions. */ - if (dhp != NULL) { - for (i = 0; i < dof->dofh_secnum; i++) { - dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr + - dof->dofh_secoff + i * dof->dofh_secsize); - - if (sec->dofs_type != DOF_SECT_PROVIDER) - continue; + for (i = 0; i < dof->dofh_secnum; i++) { + dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr + + dof->dofh_secoff + i * dof->dofh_secsize); - if (dtrace_helper_provider_validate(dof, sec) != 0) { - dtrace_enabling_destroy(enab); - dtrace_dof_destroy(dof); - return (-1); - } + if (sec->dofs_type != DOF_SECT_PROVIDER) + continue; - nprovs++; + if (dtrace_helper_provider_validate(dof, sec) != 0) { + dtrace_enabling_destroy(enab); + dtrace_dof_destroy(dof); + return (-1); } + + nprovs++; } /* @@ -14655,12 +16798,13 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_ continue; if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK, - ep)) != 0) { + ep, help)) != 0) { /* * Adding this helper action failed -- we are now going * to rip out the entire generation and return failure. */ - (void) dtrace_helper_destroygen(help->dthps_generation); + (void) dtrace_helper_destroygen(help, + help->dthps_generation); dtrace_enabling_destroy(enab); dtrace_dof_destroy(dof); return (-1); @@ -14675,11 +16819,18 @@ dtrace_helper_slurp(dof_hdr_t *dof, dof_ gen = help->dthps_generation++; dtrace_enabling_destroy(enab); - if (dhp != NULL && nprovs > 0) { + if (nprovs > 0) { + /* + * Now that this is in-kernel, we change the sense of the + * members: dofhp_dof denotes the in-kernel copy of the DOF + * and dofhp_addr denotes the address at user-level. 
+ */ + dhp->dofhp_addr = dhp->dofhp_dof; dhp->dofhp_dof = (uint64_t)(uintptr_t)dof; - if (dtrace_helper_provider_add(dhp, gen) == 0) { + + if (dtrace_helper_provider_add(dhp, help, gen) == 0) { mutex_exit(&dtrace_lock); - dtrace_helper_provider_register(curproc, help, dhp); + dtrace_helper_provider_register(p, help, dhp); mutex_enter(&dtrace_lock); destroy = 0; @@ -14710,12 +16861,17 @@ dtrace_helpers_create(proc_t *p) return (help); } -static void -dtrace_helpers_destroy(void) +#ifdef illumos +static +#endif +void +dtrace_helpers_destroy(proc_t *p) { dtrace_helpers_t *help; dtrace_vstate_t *vstate; +#ifdef illumos proc_t *p = curproc; +#endif int i; mutex_enter(&dtrace_lock); @@ -14802,7 +16958,10 @@ dtrace_helpers_destroy(void) mutex_exit(&dtrace_lock); } -static void +#ifdef illumos +static +#endif +void dtrace_helpers_duplicate(proc_t *from, proc_t *to) { dtrace_helpers_t *help, *newhelp; @@ -14844,7 +17003,7 @@ dtrace_helpers_duplicate(proc_t *from, p new->dtha_actions = kmem_alloc(sz, KM_SLEEP); for (j = 0; j < new->dtha_nactions; j++) { - dp = helper->dtha_actions[j]; + dtrace_difo_t *dp = helper->dtha_actions[j]; ASSERT(dp != NULL); dp = dtrace_difo_duplicate(dp, vstate); @@ -14892,10 +17051,27 @@ dtrace_module_loaded(modctl_t *ctl) { dtrace_provider_t *prv; +#ifdef __NetBSD__ + /* + * We have just one symbol table and CTF table for the entire + * base kernel, so ignore any other built-in module entries. + * This means that the module name for a given symbol will change + * depending on whether the module is built-in or loaded separately. + */ + if (module_source(ctl) == MODULE_SOURCE_KERNEL && + strcmp(module_name(ctl), "netbsd")) { + return; + } +#endif + mutex_enter(&dtrace_provider_lock); +#ifdef illumos mutex_enter(&mod_lock); +#endif +#ifdef illumos ASSERT(ctl->mod_busy); +#endif /* * We're going to call each providers per-module provide operation @@ -14904,7 +17080,9 @@ dtrace_module_loaded(modctl_t *ctl) for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); /* @@ -14937,28 +17115,76 @@ dtrace_module_loaded(modctl_t *ctl) * not a serious problem -- it just means that the module that we * just loaded may not be immediately instrumentable. */ - xdelay(1); + delay(1); } static void +#ifndef __FreeBSD__ dtrace_module_unloaded(modctl_t *ctl) +#else +dtrace_module_unloaded(modctl_t *ctl, int *error) +#endif { dtrace_probe_t template, *probe, *first, *next; dtrace_provider_t *prov; +#ifndef illumos + char modname[DTRACE_MODNAMELEN]; + size_t len; +#endif +#ifdef illumos template.dtpr_mod = ctl->mod_modname; +#endif +#ifdef __FreeBSD__ + /* Handle the fact that ctl->filename may end in ".ko". */ + strlcpy(modname, ctl->filename, sizeof(modname)); + len = strlen(ctl->filename); + if (len > 3 && strcmp(modname + len - 3, ".ko") == 0) + modname[len - 3] = '\0'; + template.dtpr_mod = modname; +#endif +#ifdef __NetBSD__ + if (module_source(ctl) == MODULE_SOURCE_KERNEL && + strcmp(module_name(ctl), "netbsd")) { + return; + } + + /* Handle the fact that ctl->filename may end in ".kmod". 
*/ + strlcpy(modname, module_name(ctl), sizeof(modname)); + len = strlen(modname); + if (len > 5 && strcmp(modname + len - 5, ".kmod") == 0) + modname[len - 5] = '\0'; + template.dtpr_mod = modname; + +#endif mutex_enter(&dtrace_provider_lock); +#ifdef illumos mutex_enter(&mod_lock); +#endif mutex_enter(&dtrace_lock); +#ifdef __FreeBSD__ + if (ctl->nenabled > 0) { + /* Don't allow unloads if a probe is enabled. */ + mutex_exit(&dtrace_provider_lock); + mutex_exit(&dtrace_lock); + *error = -1; + printf( + "kldunload: attempt to unload module that has DTrace probes enabled\n"); + return; + } +#endif + if (dtrace_bymod == NULL) { /* * The DTrace module is loaded (obviously) but not attached; * we don't have any work to do. */ mutex_exit(&dtrace_provider_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_lock); return; } @@ -14967,7 +17193,9 @@ dtrace_module_unloaded(modctl_t *ctl) probe != NULL; probe = probe->dtpr_nextmod) { if (probe->dtpr_ecb != NULL) { mutex_exit(&dtrace_provider_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_lock); /* @@ -14981,8 +17209,13 @@ dtrace_module_unloaded(modctl_t *ctl) * probe, either. */ if (dtrace_err_verbose) { +#ifdef illumos cmn_err(CE_WARN, "unloaded module '%s' had " "enabled probes", ctl->mod_modname); +#else + cmn_err(CE_WARN, "unloaded module '%s' had " + "enabled probes", modname); +#endif } return; @@ -15025,15 +17258,45 @@ dtrace_module_unloaded(modctl_t *ctl) kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); +#ifdef illumos vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1); +#endif +#ifdef __FreeBSD__ + free_unr(dtrace_arena, probe->dtpr_id); +#endif +#ifdef __NetBSD__ + vmem_free(dtrace_arena, (uintptr_t)probe->dtpr_id, 1); +#endif kmem_free(probe, sizeof (dtrace_probe_t)); } mutex_exit(&dtrace_lock); +#ifdef illumos mutex_exit(&mod_lock); +#endif mutex_exit(&dtrace_provider_lock); } +#ifdef __FreeBSD__ +static void +dtrace_kld_load(void *arg __unused, linker_file_t lf) +{ + + dtrace_module_loaded(lf); +} + +static void +dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error) +{ + + if (*error != 0) + /* We already have an error, so don't do anything. */ + return; + dtrace_module_unloaded(lf, error); +} +#endif + +#ifdef illumos static void dtrace_suspend(void) { @@ -15106,7 +17369,7 @@ dtrace_cpu_setup(cpu_setup_t what, proce return (0); } -#if defined(sun) +#ifdef illumos static void dtrace_cpu_setup_initial(processorid_t cpu) { @@ -15151,10 +17414,29 @@ dtrace_toxrange_add(uintptr_t base, uint dtrace_toxranges++; } +static void +dtrace_getf_barrier() +{ +#ifdef illumos + /* + * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings + * that contain calls to getf(), this routine will be called on every + * closef() before either the underlying vnode is released or the + * file_t itself is freed. By the time we are here, it is essential + * that the file_t can no longer be accessed from a call to getf() + * in probe context -- that assures that a dtrace_sync() can be used + * to clear out any enablings referring to the old structures. 
+ */ + if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 || + kcred->cr_zone->zone_dtrace_getf != 0) + dtrace_sync(); +#endif +} + /* * DTrace Driver Cookbook Functions */ -#if defined(sun) +#ifdef illumos /*ARGSUSED*/ static int dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) @@ -15266,17 +17548,6 @@ dtrace_attach(dev_info_t *devi, ddi_atta mutex_exit(&cpu_lock); /* - * If DTrace helper tracing is enabled, we need to allocate the - * trace buffer and initialize the values. - */ - if (dtrace_helptrace_enabled) { - ASSERT(dtrace_helptrace_buffer == NULL); - dtrace_helptrace_buffer = - kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP); - dtrace_helptrace_next = 0; - } - - /* * If there are already providers, we must ask them to provide their * probes, and then match any anonymous enabling against them. Note * that there should be no other retained enablings at this time: @@ -15323,16 +17594,7 @@ dtrace_attach(dev_info_t *devi, ddi_atta } #endif -#if !defined(sun) -#if __FreeBSD_version >= 800039 -static void -dtrace_dtr(void *data __unused) -{ -} -#endif -#endif - -#if !defined(sun) +#ifdef __NetBSD__ static dev_type_open(dtrace_open); /* Pseudo Device Entry points */ @@ -15367,11 +17629,19 @@ static const struct fileops dtrace_fileo }; #endif +#ifndef illumos +static void dtrace_dtr(void *); +#endif + /*ARGSUSED*/ static int -#if defined(sun) +#ifdef illumos dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) -#else +#endif +#ifdef __FreeBSD_ +dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +#endif +#ifdef __NetBSD__ dtrace_open(dev_t dev, int flags, int mode, struct lwp *l) #endif { @@ -15380,7 +17650,7 @@ dtrace_open(dev_t dev, int flags, int mo uid_t uid; zoneid_t zoneid; -#if defined(sun) +#ifdef illumos if (getminor(*devp) == DTRACEMNRN_HELPER) return (0); @@ -15388,8 +17658,16 @@ dtrace_open(dev_t dev, int flags, int mo * If this wasn't an open with the "helper" minor, then it must be * the "dtrace" minor. */ - ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE); -#else + if (getminor(*devp) == DTRACEMNRN_DTRACE) + return (ENXIO); +#endif +#ifdef __FreeBSD__ + cred_t *cred_p = NULL; + cred_p = dev->si_cred; + + +#endif +#ifdef __NetBSD__ cred_t *cred_p = NULL; struct file *fp; int fd; @@ -15397,27 +17675,6 @@ dtrace_open(dev_t dev, int flags, int mo if ((res = fd_allocfile(&fp, &fd)) != 0) return res; -#if 0 -#if __FreeBSD_version < 800039 - /* - * The first minor device is the one that is cloned so there is - * nothing more to do here. - */ - if (dev2unit(dev) == 0) - return 0; - - /* - * Devices are cloned, so if the DTrace state has already - * been allocated, that means this device belongs to a - * different client. Each client should open '/dev/dtrace' - * to get a cloned device. - */ - if (dev->si_drv1 != NULL) - return (EBUSY); -#endif - - cred_p = dev->si_cred; -#endif cred_p = l->l_cred; #endif @@ -15442,7 +17699,7 @@ dtrace_open(dev_t dev, int flags, int mo dtrace_opens++; dtrace_membar_producer(); -#if defined(sun) +#ifdef illumos /* * If the kernel debugger is active (that is, if the kernel debugger * modified text in some way), we won't allow the open. @@ -15454,15 +17711,31 @@ dtrace_open(dev_t dev, int flags, int mo return (EBUSY); } + if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) { + /* + * If DTrace helper tracing is enabled, we need to allocate the + * trace buffer and initialize the values. 
+ */ + dtrace_helptrace_buffer = + kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP); + dtrace_helptrace_next = 0; + dtrace_helptrace_wrapped = 0; + dtrace_helptrace_enable = 0; + } state = dtrace_state_create(devp, cred_p); -#else - state = dtrace_state_create(dev, cred_p); +#endif +#ifdef __FreeBSD__ + state = dtrace_state_create(dev, NULL); + devfs_set_cdevpriv(state, dtrace_dtr); +#endif +#ifdef __NetBSD__ + state = dtrace_state_create(&dev, cred_p); #endif mutex_exit(&cpu_lock); if (state == NULL) { -#if defined(sun) +#ifdef illumos if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); #else @@ -15474,54 +17747,86 @@ dtrace_open(dev_t dev, int flags, int mo mutex_exit(&dtrace_lock); -#if defined(sun) - return (0); -#else +#ifdef __NetBSD__ return fd_clone(fp, fd, flags, &dtrace_fileops, state); +#else + return (0); #endif } /*ARGSUSED*/ +#ifdef illumos static int -#if defined(sun) dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) -#else +#endif +#ifdef __FreeBSD__ +static void +dtrace_dtr(void *data) +#endif +#ifdef __NetBSD__ +static int dtrace_close(struct file *fp) #endif { -#if defined(sun) +#ifdef illumos minor_t minor = getminor(dev); dtrace_state_t *state; - +#endif + dtrace_helptrace_t *buf = NULL; + +#ifdef illumos if (minor == DTRACEMNRN_HELPER) return (0); state = ddi_get_soft_state(dtrace_softstate, minor); -#else +#endif +#ifdef __FreeBSD__ + dtrace_state_t *state = data; +#endif +#ifdef __NetBSD__ dtrace_state_t *state = (dtrace_state_t *)fp->f_data; #endif mutex_enter(&cpu_lock); mutex_enter(&dtrace_lock); - if (state != NULL) { - if (state->dts_anon) { - /* - * There is anonymous state. Destroy that first. - */ - ASSERT(dtrace_anon.dta_state == NULL); - dtrace_state_destroy(state->dts_anon); - } +#if defined(illumos) || defined(__NetBSD__) + if (state->dts_anon) +#else + if (state != NULL && state->dts_anon) +#endif + { + /* + * There is anonymous state. Destroy that first. + */ + ASSERT(dtrace_anon.dta_state == NULL); + dtrace_state_destroy(state->dts_anon); + } - dtrace_state_destroy(state); + if (dtrace_helptrace_disable) { + /* + * If we have been told to disable helper tracing, set the + * buffer to NULL before calling into dtrace_state_destroy(); + * we take advantage of its dtrace_sync() to know that no + * CPU is in probe context with enabled helper tracing + * after it returns. + */ + buf = dtrace_helptrace_buffer; + dtrace_helptrace_buffer = NULL; + } -#if !defined(sun) - fp->f_data = NULL; -#endif +#if defined(illumos) || defined(__NetBSD__) + dtrace_state_destroy(state); +#else + if (state != NULL) { + dtrace_state_destroy(state); + kmem_free(state, 0); } +#endif ASSERT(dtrace_opens > 0); -#if defined(sun) + +#ifdef illumos /* * Only relinquish control of the kernel debugger interface when there * are no consumers and no anonymous enablings. 
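[Reviewer note, not part of the patch: in the dtrace_dtr()/dtrace_close() changes above, the helper-trace buffer is first unpublished (dtrace_helptrace_buffer is set to NULL) before dtrace_state_destroy() runs, and only freed afterwards, so the dtrace_sync() performed during teardown guarantees no CPU is still writing the buffer from probe context. Below is a minimal, self-contained sketch of that publish/quiesce/free pattern; trace_buf, trace_enable(), trace_disable() and quiesce() are hypothetical stand-ins for the kernel primitives.]

    #include <stdatomic.h>
    #include <stdlib.h>

    /* Globally published trace buffer; probe-context code loads it once per record. */
    static _Atomic(char *) trace_buf;

    /*
     * Stand-in for dtrace_sync(): returns only once every CPU has passed
     * through a point where it can no longer hold a stale copy of trace_buf.
     */
    static void
    quiesce(void)
    {
    }

    void
    trace_enable(size_t size)
    {
            atomic_store(&trace_buf, calloc(1, size));	/* publish */
    }

    void
    trace_disable(void)
    {
            /* Unpublish first, so new probe firings see NULL ... */
            char *old = atomic_exchange(&trace_buf, NULL);

            /* ... wait for any in-flight users to drain ... */
            quiesce();

            /* ... and only then release the memory. */
            free(old);
    }

[End of note; the next hunk continues dtrace_close().]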
@@ -15532,13 +17837,20 @@ dtrace_close(struct file *fp) --dtrace_opens; #endif + if (buf != NULL) { + kmem_free(buf, dtrace_helptrace_bufsize); + dtrace_helptrace_disable = 0; + } + mutex_exit(&dtrace_lock); mutex_exit(&cpu_lock); +#if defined(illumos) || defined(__NetBSD__) return (0); +#endif } -#if defined(sun) +#ifdef illumos /*ARGSUSED*/ static int dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv) @@ -15582,7 +17894,7 @@ dtrace_ioctl_helper(int cmd, intptr_t ar case DTRACEHIOC_REMOVE: { mutex_enter(&dtrace_lock); - rval = dtrace_helper_destroygen(arg); + rval = dtrace_helper_destroygen(NULL, arg); mutex_exit(&dtrace_lock); return (rval); @@ -16153,6 +18465,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_ desc.dtbd_drops = buf->dtb_drops; desc.dtbd_errors = buf->dtb_errors; desc.dtbd_oldest = buf->dtb_xamot_offset; + desc.dtbd_timestamp = dtrace_gethrtime(); mutex_exit(&dtrace_lock); @@ -16205,6 +18518,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_ desc.dtbd_drops = buf->dtb_xamot_drops; desc.dtbd_errors = buf->dtb_xamot_errors; desc.dtbd_oldest = 0; + desc.dtbd_timestamp = buf->dtb_switched; mutex_exit(&dtrace_lock); @@ -16420,12 +18734,10 @@ dtrace_detach(dev_info_t *dip, ddi_detac dtrace_modload = NULL; dtrace_modunload = NULL; - mutex_exit(&cpu_lock); + ASSERT(dtrace_getf == 0); + ASSERT(dtrace_closef == NULL); - if (dtrace_helptrace_enabled) { - kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize); - dtrace_helptrace_buffer = NULL; - } + mutex_exit(&cpu_lock); kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *)); dtrace_probes = NULL; @@ -16477,7 +18789,7 @@ dtrace_detach(dev_info_t *dip, ddi_detac } #endif -#if defined(sun) +#ifdef illumos /*ARGSUSED*/ static int dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) @@ -16500,7 +18812,7 @@ dtrace_info(dev_info_t *dip, ddi_info_cm } #endif -#if defined(sun) +#ifdef illumos static struct cb_ops dtrace_cb_ops = { dtrace_open, /* open */ dtrace_close, /* close */ @@ -16562,62 +18874,36 @@ _fini(void) { return (mod_remove(&modlinkage)); } -#else +#endif -#if 0 +#ifdef __FreeBSD__ static d_ioctl_t dtrace_ioctl; +static d_ioctl_t dtrace_ioctl_helper; static void dtrace_load(void *); static int dtrace_unload(void); -#if __FreeBSD_version < 800039 -static void dtrace_clone(void *, struct ucred *, char *, int , struct cdev **); -static struct clonedevs *dtrace_clones; /* Ptr to the array of cloned devices. */ -static eventhandler_tag eh_tag; /* Event handler tag. 
*/ -#else static struct cdev *dtrace_dev; -#endif +static struct cdev *helper_dev; void dtrace_invop_init(void); void dtrace_invop_uninit(void); static struct cdevsw dtrace_cdevsw = { .d_version = D_VERSION, - .d_flags = D_TRACKCLOSE | D_NEEDMINOR, - .d_close = dtrace_close, .d_ioctl = dtrace_ioctl, .d_open = dtrace_open, .d_name = "dtrace", }; -#endif -void dtrace_invop_init(void); -void dtrace_invop_uninit(void); - -static void dtrace_load(void *); -static int dtrace_unload(void); - -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) -#include -#endif - -MODULE(MODULE_CLASS_DRIVER, dtrace, "solaris"); -#if 0 -DEV_MODULE(dtrace, dtrace_modevent, NULL); -MODULE_VERSION(dtrace, 1); -MODULE_DEPEND(dtrace, cyclic, 1, 1, 1); -MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1); -#endif -#endif +static struct cdevsw helper_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = dtrace_ioctl_helper, + .d_name = "helper", +}; +#endif /* __FreeBSD__ */ -#if !defined(sun) -#undef mutex_init +#ifdef __NetBSD__ +void dtrace_invop_init(void); +void dtrace_invop_uninit(void); struct dtrace_state_worker { kmutex_t lock; @@ -16657,7 +18943,7 @@ dtrace_state_worker_add(void (*fn)(dtrac int error __diagused; w = kmem_alloc(sizeof(*w), KM_SLEEP); - mutex_init(&w->lock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&w->lock, "dtrace", MUTEX_DEFAULT, NULL); cv_init(&w->cv, "dtrace"); w->interval = ((uintmax_t)hz * interval) / NANOSEC; w->fn = fn; @@ -16685,4 +18971,28 @@ dtrace_state_worker_remove(struct dtrace mutex_destroy(&w->lock); kmem_free(w, sizeof(*w)); } -#endif + +#endif /* __NetBSD__ */ + +static void dtrace_load(void *); +static int dtrace_unload(void); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +DEV_MODULE(dtrace, dtrace_modevent, NULL); +MODULE_VERSION(dtrace, 1); +MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1); +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ +MODULE(MODULE_CLASS_DRIVER, dtrace, "solaris"); +#endif /* __NetBSD__ */ Index: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c diff -N src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.c 2 Mar 2017 10:54:24 -0000 @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 2016 (Graeme Jenkinson) + * All rights reserved. + * + * This software was developed by BAE Systems, the University of Cambridge + * Computer Laboratory, and Memorial University under DARPA/AFRL contract + * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing + * (TC) research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include + +#include "dtrace_xoroshiro128_plus.h" + +static __inline uint64_t +rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +/* + * This is the jump function for the generator. It is equivalent to 2^64 calls + * to next(); it can be used to generate 2^64 non-overlapping subsequences for + * parallel computations. + */ +void +dtrace_xoroshiro128_plus_jump(uint64_t * const state, + uint64_t * const jump_state) +{ + static const uint64_t JUMP[] = { 0xbeac0467eba5facb, + 0xd86b048b86aa9922 }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + int i = 0; + int b = 0; + for (i = 0; i < sizeof JUMP / sizeof *JUMP; i++) { + for (b = 0; b < 64; b++) { + if (JUMP[i] & 1ULL << b) { + s0 ^= state[0]; + s1 ^= state[1]; + } + dtrace_xoroshiro128_plus_next(state); + } + } + jump_state[0] = s0; + jump_state[1] = s1; +} + +/* + * xoroshiro128+ - XOR/rotate/shift/rotate + * xorshift.di.unimi.it + */ +uint64_t +dtrace_xoroshiro128_plus_next(uint64_t * const state) +{ + const uint64_t s0 = state[0]; + uint64_t s1 = state[1]; + uint64_t result; + result = s0 + s1; + + s1 ^= s0; + state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); + state[1] = rotl(s1, 36); + + return result; +} Index: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h diff -N src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/dtrace/dtrace_xoroshiro128_plus.h 2 Mar 2017 10:54:24 -0000 @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2016 (Graeme Jenkinson) + * All rights reserved. + * + * This software was developed by BAE Systems, the University of Cambridge + * Computer Laboratory, and Memorial University under DARPA/AFRL contract + * FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent Computing + * (TC) research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#ifndef _DTRACE_XOROSHIRO128_PLUS_H +#define _DTRACE_XOROSHIRO128_PLUS_H + +#include + +void dtrace_xoroshiro128_plus_jump(uint64_t * const, uint64_t * const); +uint64_t dtrace_xoroshiro128_plus_next(uint64_t * const); + +#endif Index: src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c,v retrieving revision 1.4 diff -u -p -r1.4 fasttrap.c --- src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c 27 Mar 2014 15:50:48 -0000 1.4 +++ src/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c 2 Mar 2017 10:54:24 -0000 @@ -17,14 +17,20 @@ * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END + * + * Portions Copyright 2010 The FreeBSD Foundation + * + * $FreeBSD: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 313758 2017-02-15 06:07:01Z markj $ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ #include #include @@ -32,11 +38,15 @@ #include #include #include +#ifdef illumos #include +#endif #include #include #include +#ifdef illumos #include +#endif #include #include #include @@ -44,9 +54,28 @@ #include #include #include -#include #include +#ifdef illumos #include +#endif +#include +#include +#ifndef illumos +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#endif /* * User-Land Trap-Based Tracing @@ -125,12 +154,21 @@ * never hold the provider lock and creation lock simultaneously */ -static dev_info_t *fasttrap_devi; +static d_open_t fasttrap_open; +static d_ioctl_t fasttrap_ioctl; + +static struct cdevsw fasttrap_cdevsw = { + .d_version = D_VERSION, + .d_open = fasttrap_open, + .d_ioctl = fasttrap_ioctl, + .d_name = "fasttrap", +}; +static struct cdev *fasttrap_cdev; static dtrace_meta_provider_id_t fasttrap_meta_id; -static timeout_id_t fasttrap_timeout; -static kmutex_t fasttrap_cleanup_mtx; -static uint_t fasttrap_cleanup_work; +static struct proc *fasttrap_cleanup_proc; +static struct mtx fasttrap_cleanup_mtx; +static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv; /* * Generation count on modifications to the global tracepoint lookup table. @@ -139,15 +177,19 @@ static volatile uint64_t fasttrap_mod_ge /* * When the fasttrap provider is loaded, fasttrap_max is set to either - * FASTTRAP_MAX_DEFAULT or the value for fasttrap-max-probes in the - * fasttrap.conf file. 
Each time a probe is created, fasttrap_total is - * incremented by the number of tracepoints that may be associated with that - * probe; fasttrap_total is capped at fasttrap_max. + * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the + * fasttrap.conf file (Illumos), or the value provied in the loader.conf (FreeBSD). + * Each time a probe is created, fasttrap_total is incremented by the number + * of tracepoints that may be associated with that probe; fasttrap_total is capped + * at fasttrap_max. */ #define FASTTRAP_MAX_DEFAULT 250000 -static uint32_t fasttrap_max; +static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT; static uint32_t fasttrap_total; +/* + * Copyright (c) 2011, Joyent, Inc. All rights reserved. + */ #define FASTTRAP_TPOINTS_DEFAULT_SIZE 0x4000 #define FASTTRAP_PROVIDERS_DEFAULT_SIZE 0x100 @@ -176,11 +218,31 @@ static void fasttrap_provider_free(fastt static fasttrap_proc_t *fasttrap_proc_lookup(pid_t); static void fasttrap_proc_release(fasttrap_proc_t *); +#ifndef illumos +static void fasttrap_thread_dtor(void *, struct thread *); +#endif + #define FASTTRAP_PROVS_INDEX(pid, name) \ ((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask) #define FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask) +#ifndef illumos +struct rmlock fasttrap_tp_lock; +static eventhandler_tag fasttrap_thread_dtor_tag; +#endif + +static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE; + +#ifdef __FreeBSD__ +SYSCTL_DECL(_kern_dtrace); +SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD, 0, "DTrace fasttrap parameters"); +SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max, + FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes"); +SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size, + FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table"); +#endif + static int fasttrap_highbit(ulong_t i) { @@ -229,6 +291,7 @@ fasttrap_hash_str(const char *p) void fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc) { +#ifdef illumos sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); sqp->sq_info.si_signo = SIGTRAP; @@ -241,7 +304,130 @@ fasttrap_sigtrap(proc_t *p, kthread_t *t if (t != NULL) aston(t); +#else + ksiginfo_t *ksi = kmem_zalloc(sizeof (ksiginfo_t), KM_SLEEP); + + ksiginfo_init(ksi); + ksi->ksi_signo = SIGTRAP; + ksi->ksi_code = TRAP_DTRACE; + ksi->ksi_addr = (caddr_t)pc; + PROC_LOCK(p); + (void) tdsendsignal(p, t, SIGTRAP, ksi); + PROC_UNLOCK(p); +#endif +} + +#ifndef illumos +/* + * Obtain a chunk of scratch space in the address space of the target process. + */ +fasttrap_scrspace_t * +fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc) +{ + fasttrap_scrblock_t *scrblk; + fasttrap_scrspace_t *scrspc; + struct proc *p; + vm_offset_t addr; + int error, i; + + scrspc = NULL; + if (td->t_dtrace_sscr != NULL) { + /* If the thread already has scratch space, we're done. */ + scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr; + return (scrspc); + } + + p = td->td_proc; + + mutex_enter(&fprc->ftpc_mtx); + if (LIST_EMPTY(&fprc->ftpc_fscr)) { + /* + * No scratch space is available, so we'll map a new scratch + * space block into the traced process' address space. 
+ */ + addr = 0; + error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, + FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL, + VM_PROT_ALL, 0); + if (error != KERN_SUCCESS) + goto done; + + scrblk = malloc(sizeof(*scrblk), M_SOLARIS, M_WAITOK); + scrblk->ftsb_addr = addr; + LIST_INSERT_HEAD(&fprc->ftpc_scrblks, scrblk, ftsb_next); + + /* + * Carve the block up into chunks and put them on the free list. + */ + for (i = 0; + i < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; i++) { + scrspc = malloc(sizeof(*scrspc), M_SOLARIS, M_WAITOK); + scrspc->ftss_addr = addr + + i * FASTTRAP_SCRSPACE_SIZE; + LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, + ftss_next); + } + } + + /* + * Take the first scratch chunk off the free list, put it on the + * allocated list, and return its address. + */ + scrspc = LIST_FIRST(&fprc->ftpc_fscr); + LIST_REMOVE(scrspc, ftss_next); + LIST_INSERT_HEAD(&fprc->ftpc_ascr, scrspc, ftss_next); + + /* + * This scratch space is reserved for use by td until the thread exits. + */ + td->t_dtrace_sscr = scrspc; + +done: + mutex_exit(&fprc->ftpc_mtx); + + return (scrspc); +} + +/* + * Return any allocated per-thread scratch space chunks back to the process' + * free list. + */ +static void +fasttrap_thread_dtor(void *arg __unused, struct thread *td) +{ + fasttrap_bucket_t *bucket; + fasttrap_proc_t *fprc; + fasttrap_scrspace_t *scrspc; + pid_t pid; + + if (td->t_dtrace_sscr == NULL) + return; + + pid = td->td_proc->p_pid; + bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)]; + fprc = NULL; + + /* Look up the fasttrap process handle for this process. */ + mutex_enter(&bucket->ftb_mtx); + for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) { + if (fprc->ftpc_pid == pid) { + mutex_enter(&fprc->ftpc_mtx); + mutex_exit(&bucket->ftb_mtx); + break; + } + } + if (fprc == NULL) { + mutex_exit(&bucket->ftb_mtx); + return; + } + + scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr; + LIST_REMOVE(scrspc, ftss_next); + LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next); + + mutex_exit(&fprc->ftpc_mtx); } +#endif /* * This function ensures that no threads are actively using the memory @@ -257,15 +443,23 @@ fasttrap_mod_barrier(uint64_t gen) fasttrap_mod_gen++; - for (i = 0; i < NCPU; i++) { - mutex_enter(&cpu_core[i].cpuc_pid_lock); - mutex_exit(&cpu_core[i].cpuc_pid_lock); +#ifdef illumos + CPU_FOREACH(i) { + mutex_enter(&fasttrap_cpuc_pid_lock[i]); + mutex_exit(&fasttrap_cpuc_pid_lock[i]); } +#else + rm_wlock(&fasttrap_tp_lock); + rm_wunlock(&fasttrap_tp_lock); +#endif } /* - * This is the timeout's callback for cleaning up the providers and their - * probes. + * This function performs asynchronous cleanup of fasttrap providers. The + * Solaris implementation of this mechanism use a timeout that's activated in + * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while + * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler. + * Thus we use a dedicated process to perform the cleanup when requested. 
*/ /*ARGSUSED*/ static void @@ -274,16 +468,12 @@ fasttrap_pid_cleanup_cb(void *data) fasttrap_provider_t **fpp, *fp; fasttrap_bucket_t *bucket; dtrace_provider_id_t provid; - int i, later; - - static volatile int in = 0; - ASSERT(in == 0); - in = 1; + int i, later = 0, rval; - mutex_enter(&fasttrap_cleanup_mtx); - while (fasttrap_cleanup_work) { + mtx_lock(&fasttrap_cleanup_mtx); + while (!fasttrap_cleanup_drain || later > 0) { fasttrap_cleanup_work = 0; - mutex_exit(&fasttrap_cleanup_mtx); + mtx_unlock(&fasttrap_cleanup_mtx); later = 0; @@ -336,9 +526,13 @@ fasttrap_pid_cleanup_cb(void *data) * clean out the unenabled probes. */ provid = fp->ftp_provid; - if (dtrace_unregister(provid) != 0) { + if ((rval = dtrace_unregister(provid)) != 0) { if (fasttrap_total > fasttrap_max / 2) (void) dtrace_condense(provid); + + if (rval == EAGAIN) + fp->ftp_marked = 1; + later += fp->ftp_marked; fpp = &fp->ftp_next; } else { @@ -348,31 +542,32 @@ fasttrap_pid_cleanup_cb(void *data) } mutex_exit(&bucket->ftb_mtx); } + mtx_lock(&fasttrap_cleanup_mtx); - mutex_enter(&fasttrap_cleanup_mtx); + /* + * If we were unable to retire a provider, try again after a + * second. This situation can occur in certain circumstances + * where providers cannot be unregistered even though they have + * no probes enabled because of an execution of dtrace -l or + * something similar. + */ + if (later > 0 || fasttrap_cleanup_work || + fasttrap_cleanup_drain) { + mtx_unlock(&fasttrap_cleanup_mtx); + pause("ftclean", hz); + mtx_lock(&fasttrap_cleanup_mtx); + } else + mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx, + 0, "ftcl", 0); } - ASSERT(fasttrap_timeout != 0); - /* - * If we were unable to remove a retired provider, try again after - * a second. This situation can occur in certain circumstances where - * providers cannot be unregistered even though they have no probes - * enabled because of an execution of dtrace -l or something similar. - * If the timeout has been disabled (set to 1 because we're trying - * to detach), we set fasttrap_cleanup_work to ensure that we'll - * get a chance to do that work if and when the timeout is reenabled - * (if detach fails). + * Wake up the thread in fasttrap_unload() now that we're done. */ - if (later > 0 && fasttrap_timeout != (timeout_id_t)1) - fasttrap_timeout = timeout(&fasttrap_pid_cleanup_cb, NULL, hz); - else if (later > 0) - fasttrap_cleanup_work = 1; - else - fasttrap_timeout = 0; + wakeup(&fasttrap_cleanup_drain); + mtx_unlock(&fasttrap_cleanup_mtx); - mutex_exit(&fasttrap_cleanup_mtx); - in = 0; + kthread_exit(); } /* @@ -381,11 +576,13 @@ fasttrap_pid_cleanup_cb(void *data) static void fasttrap_pid_cleanup(void) { - mutex_enter(&fasttrap_cleanup_mtx); - fasttrap_cleanup_work = 1; - if (fasttrap_timeout == 0) - fasttrap_timeout = timeout(&fasttrap_pid_cleanup_cb, NULL, 1); - mutex_exit(&fasttrap_cleanup_mtx); + + mtx_lock(&fasttrap_cleanup_mtx); + if (!fasttrap_cleanup_work) { + fasttrap_cleanup_work = 1; + wakeup(&fasttrap_cleanup_cv); + } + mtx_unlock(&fasttrap_cleanup_mtx); } /* @@ -397,12 +594,42 @@ fasttrap_pid_cleanup(void) static void fasttrap_fork(proc_t *p, proc_t *cp) { +#ifndef illumos + fasttrap_scrblock_t *scrblk; + fasttrap_proc_t *fprc = NULL; +#endif pid_t ppid = p->p_pid; int i; +#ifdef illumos ASSERT(curproc == p); ASSERT(p->p_proc_flag & P_PR_LOCK); +#else + PROC_LOCK_ASSERT(p, MA_OWNED); +#endif +#ifdef illumos ASSERT(p->p_dtrace_count > 0); +#else + if (p->p_dtrace_helpers) { + /* + * dtrace_helpers_duplicate() allocates memory. 
+ */ + _PHOLD(cp); + PROC_UNLOCK(p); + PROC_UNLOCK(cp); + dtrace_helpers_duplicate(p, cp); + PROC_LOCK(cp); + PROC_LOCK(p); + _PRELE(cp); + } + /* + * This check is purposely here instead of in kern_fork.c because, + * for legal resons, we cannot include the dtrace_cddl.h header + * inside kern_fork.c and insert if-clause there. + */ + if (p->p_dtrace_count == 0) + return; +#endif ASSERT(cp->p_dtrace_count == 0); /* @@ -419,9 +646,19 @@ fasttrap_fork(proc_t *p, proc_t *cp) * We don't have to worry about the child process disappearing * because we're in fork(). */ - mutex_enter(&cp->p_lock); +#ifdef illumos + mtx_lock_spin(&cp->p_slock); sprlock_proc(cp); - mutex_exit(&cp->p_lock); + mtx_unlock_spin(&cp->p_slock); +#else + /* + * fasttrap_tracepoint_remove() expects the child process to be + * unlocked and the VM then expects curproc to be unlocked. + */ + _PHOLD(cp); + PROC_UNLOCK(cp); + PROC_UNLOCK(p); +#endif /* * Iterate over every tracepoint looking for ones that belong to the @@ -446,13 +683,38 @@ fasttrap_fork(proc_t *p, proc_t *cp) * mid-fork. */ ASSERT(tp->ftt_proc->ftpc_acount != 0); +#ifndef illumos + fprc = tp->ftt_proc; +#endif } } mutex_exit(&bucket->ftb_mtx); + +#ifndef illumos + /* + * Unmap any scratch space inherited from the parent's address + * space. + */ + if (fprc != NULL) { + mutex_enter(&fprc->ftpc_mtx); + LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) { + vm_map_remove(&cp->p_vmspace->vm_map, + scrblk->ftsb_addr, + scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE); + } + mutex_exit(&fprc->ftpc_mtx); + } +#endif } +#ifdef illumos mutex_enter(&cp->p_lock); sprunlock(cp); +#else + PROC_LOCK(p); + PROC_LOCK(cp); + _PRELE(cp); +#endif } /* @@ -463,24 +725,42 @@ fasttrap_fork(proc_t *p, proc_t *cp) static void fasttrap_exec_exit(proc_t *p) { - ASSERT(p == curproc); - ASSERT(MUTEX_HELD(&p->p_lock)); +#ifndef illumos + struct thread *td; +#endif - mutex_exit(&p->p_lock); +#ifdef illumos + ASSERT(p == curproc); +#else + PROC_LOCK_ASSERT(p, MA_OWNED); + _PHOLD(p); + /* + * Since struct threads may be recycled, we cannot rely on t_dtrace_sscr + * fields to be zeroed by kdtrace_thread_ctor. Thus we must zero it + * ourselves when a process exits. + */ + FOREACH_THREAD_IN_PROC(p, td) + td->t_dtrace_sscr = NULL; + PROC_UNLOCK(p); +#endif /* * We clean up the pid provider for this process here; user-land * static probes are handled by the meta-provider remove entry point. */ fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0); - - mutex_enter(&p->p_lock); +#ifndef illumos + if (p->p_dtrace_helpers) + dtrace_helpers_destroy(p); + PROC_LOCK(p); + _PRELE(p); +#endif } /*ARGSUSED*/ static void -fasttrap_pid_provide(void *arg, const dtrace_probedesc_t *desc) +fasttrap_pid_provide(void *arg, dtrace_probedesc_t *desc) { /* * There are no "default" pid probes. @@ -504,7 +784,9 @@ fasttrap_tracepoint_enable(proc_t *p, fa ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid); +#ifdef illumos ASSERT(!(p->p_flag & SVFORK)); +#endif /* * Before we make any modifications, make sure we've imposed a barrier @@ -610,7 +892,9 @@ again: * Increment the count of the number of tracepoints active in * the victim process. 
*/ +#ifdef illumos ASSERT(p->p_proc_flag & P_PR_LOCK); +#endif p->p_dtrace_count++; return (rc); @@ -647,6 +931,13 @@ again: ASSERT(0); } +#ifdef __FreeBSD__ + if (SV_PROC_FLAG(p, SV_LP64)) + p->p_model = DATAMODEL_LP64; + else + p->p_model = DATAMODEL_ILP32; +#endif + /* * If the ISA-dependent initialization goes to plan, go back to the * beginning and try to install this freshly made tracepoint. @@ -666,7 +957,7 @@ fasttrap_tracepoint_disable(proc_t *p, f fasttrap_bucket_t *bucket; fasttrap_provider_t *provider = probe->ftp_prov; fasttrap_tracepoint_t **pp, *tp; - fasttrap_id_t *id, **idp; + fasttrap_id_t *id, **idp = NULL; pid_t pid; uintptr_t pc; @@ -800,7 +1091,9 @@ fasttrap_tracepoint_disable(proc_t *p, f * Decrement the count of the number of tracepoints active * in the victim process. */ +#ifdef illumos ASSERT(p->p_proc_flag & P_PR_LOCK); +#endif p->p_dtrace_count--; } @@ -851,42 +1144,49 @@ fasttrap_enable_callbacks(void) static void fasttrap_disable_callbacks(void) { +#ifdef illumos ASSERT(MUTEX_HELD(&cpu_lock)); +#endif + mutex_enter(&fasttrap_count_mtx); ASSERT(fasttrap_pid_count > 0); fasttrap_pid_count--; if (fasttrap_pid_count == 0) { +#ifdef illumos cpu_t *cur, *cpu = CPU; for (cur = cpu->cpu_next_onln; cur != cpu; cur = cur->cpu_next_onln) { rw_enter(&cur->cpu_ft_lock, RW_WRITER); } - +#endif dtrace_pid_probe_ptr = NULL; dtrace_return_probe_ptr = NULL; - +#ifdef illumos for (cur = cpu->cpu_next_onln; cur != cpu; cur = cur->cpu_next_onln) { rw_exit(&cur->cpu_ft_lock); } +#endif } mutex_exit(&fasttrap_count_mtx); } /*ARGSUSED*/ -static int +static void fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) { fasttrap_probe_t *probe = parg; - proc_t *p; + proc_t *p = NULL; int i, rc; ASSERT(probe != NULL); ASSERT(!probe->ftp_enabled); ASSERT(id == probe->ftp_id); +#ifdef illumos ASSERT(MUTEX_HELD(&cpu_lock)); +#endif /* * Increment the count of enabled probes on this probe's provider; @@ -904,25 +1204,36 @@ fasttrap_pid_enable(void *arg, dtrace_id * provider can't go away while we're in this code path. */ if (probe->ftp_prov->ftp_retired) - return (0); + return; /* * If we can't find the process, it may be that we're in the context of * a fork in which the traced process is being born and we're copying * USDT probes. Otherwise, the process is gone so bail. */ +#ifdef illumos if ((p = sprlock(probe->ftp_pid)) == NULL) { if ((curproc->p_flag & SFORKING) == 0) - return (0); + return; mutex_enter(&pidlock); p = prfind(probe->ftp_pid); + if (p == NULL) { + /* + * So it's not that the target process is being born, + * it's that it isn't there at all (and we simply + * happen to be forking). Anyway, we know that the + * target is definitely gone, so bail out. + */ + mutex_exit(&pidlock); + return (0); + } + /* * Confirm that curproc is indeed forking the process in which * we're trying to enable probes. */ - ASSERT(p != NULL); ASSERT(p->p_parent == curproc); ASSERT(p->p_stat == SIDL); @@ -934,6 +1245,10 @@ fasttrap_pid_enable(void *arg, dtrace_id ASSERT(!(p->p_flag & SVFORK)); mutex_exit(&p->p_lock); +#else + if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0) + return; +#endif /* * We have to enable the trap entry point before any user threads have @@ -967,23 +1282,29 @@ fasttrap_pid_enable(void *arg, dtrace_id i--; } +#ifdef illumos mutex_enter(&p->p_lock); sprunlock(p); +#else + PRELE(p); +#endif /* * Since we're not actually enabling this probe, * drop our reference on the trap table entry. 
*/ fasttrap_disable_callbacks(); - return (0); + return; } } - +#ifdef illumos mutex_enter(&p->p_lock); sprunlock(p); +#else + PRELE(p); +#endif probe->ftp_enabled = 1; - return (0); } /*ARGSUSED*/ @@ -997,18 +1318,16 @@ fasttrap_pid_disable(void *arg, dtrace_i ASSERT(id == probe->ftp_id); + mutex_enter(&provider->ftp_mtx); + /* * We won't be able to acquire a /proc-esque lock on the process * iff the process is dead and gone. In this case, we rely on the * provider lock as a point of mutual exclusion to prevent other * DTrace consumers from disabling this probe. */ - if ((p = sprlock(probe->ftp_pid)) != NULL) { - ASSERT(!(p->p_flag & SVFORK)); - mutex_exit(&p->p_lock); - } - - mutex_enter(&provider->ftp_mtx); + if (pget(probe->ftp_pid, PGET_HOLD | PGET_NOTWEXIT, &p) != 0) + p = NULL; /* * Disable all the associated tracepoints (for fully enabled probes). @@ -1031,9 +1350,6 @@ fasttrap_pid_disable(void *arg, dtrace_i if (provider->ftp_retired && !provider->ftp_marked) whack = provider->ftp_marked = 1; mutex_exit(&provider->ftp_mtx); - - mutex_enter(&p->p_lock); - sprunlock(p); } else { /* * If the process is dead, we're just waiting for the @@ -1047,12 +1363,18 @@ fasttrap_pid_disable(void *arg, dtrace_i if (whack) fasttrap_pid_cleanup(); +#ifdef __FreeBSD__ + if (p != NULL) + PRELE(p); +#endif if (!probe->ftp_enabled) return; probe->ftp_enabled = 0; +#ifdef illumos ASSERT(MUTEX_HELD(&cpu_lock)); +#endif fasttrap_disable_callbacks(); } @@ -1164,6 +1486,7 @@ fasttrap_proc_lookup(pid_t pid) fasttrap_bucket_t *bucket; fasttrap_proc_t *fprc, *new_fprc; + bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)]; mutex_enter(&bucket->ftb_mtx); @@ -1172,7 +1495,7 @@ fasttrap_proc_lookup(pid_t pid) mutex_enter(&fprc->ftpc_mtx); mutex_exit(&bucket->ftb_mtx); fprc->ftpc_rcount++; - atomic_add_64(&fprc->ftpc_acount, 1); + atomic_inc_64(&fprc->ftpc_acount); ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount); mutex_exit(&fprc->ftpc_mtx); @@ -1190,6 +1513,10 @@ fasttrap_proc_lookup(pid_t pid) new_fprc->ftpc_pid = pid; new_fprc->ftpc_rcount = 1; new_fprc->ftpc_acount = 1; +#ifndef illumos + mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT, + NULL); +#endif mutex_enter(&bucket->ftb_mtx); @@ -1202,7 +1529,7 @@ fasttrap_proc_lookup(pid_t pid) mutex_enter(&fprc->ftpc_mtx); mutex_exit(&bucket->ftb_mtx); fprc->ftpc_rcount++; - atomic_add_64(&fprc->ftpc_acount, 1); + atomic_inc_64(&fprc->ftpc_acount); ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount); mutex_exit(&fprc->ftpc_mtx); @@ -1226,6 +1553,12 @@ fasttrap_proc_release(fasttrap_proc_t *p fasttrap_bucket_t *bucket; fasttrap_proc_t *fprc, **fprcp; pid_t pid = proc->ftpc_pid; +#ifndef illumos + fasttrap_scrblock_t *scrblk, *scrblktmp; + fasttrap_scrspace_t *scrspc, *scrspctmp; + struct proc *p; + struct thread *td; +#endif mutex_enter(&proc->ftpc_mtx); @@ -1237,6 +1570,31 @@ fasttrap_proc_release(fasttrap_proc_t *p return; } +#ifndef illumos + /* + * Free all structures used to manage per-thread scratch space. 
+ */ + LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next, + scrblktmp) { + LIST_REMOVE(scrblk, ftsb_next); + free(scrblk, M_SOLARIS); + } + LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) { + LIST_REMOVE(scrspc, ftss_next); + free(scrspc, M_SOLARIS); + } + LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) { + LIST_REMOVE(scrspc, ftss_next); + free(scrspc, M_SOLARIS); + } + + if ((p = pfind(pid)) != NULL) { + FOREACH_THREAD_IN_PROC(p, td) + td->t_dtrace_sscr = NULL; + PROC_UNLOCK(p); + } +#endif + mutex_exit(&proc->ftpc_mtx); /* @@ -1312,17 +1670,8 @@ fasttrap_provider_lookup(pid_t pid, cons * Make sure the process exists, isn't a child created as the result * of a vfork(2), and isn't a zombie (but may be in fork). */ - mutex_enter(&pidlock); - if ((p = prfind(pid)) == NULL) { - mutex_exit(&pidlock); + if ((p = pfind(pid)) == NULL) return (NULL); - } - mutex_enter(&p->p_lock); - mutex_exit(&pidlock); - if (p->p_flag & (SVFORK | SEXITING)) { - mutex_exit(&p->p_lock); - return (NULL); - } /* * Increment p_dtrace_probes so that the process knows to inform us @@ -1335,15 +1684,18 @@ fasttrap_provider_lookup(pid_t pid, cons * Grab the credentials for this process so we have * something to pass to dtrace_register(). */ - mutex_enter(&p->p_crlock); - crhold(p->p_cred); - cred = p->p_cred; - mutex_exit(&p->p_crlock); - mutex_exit(&p->p_lock); + PROC_LOCK_ASSERT(p, MA_OWNED); + crhold(p->p_ucred); + cred = p->p_ucred; + PROC_UNLOCK(p); new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP); new_fp->ftp_pid = pid; new_fp->ftp_proc = fasttrap_proc_lookup(pid); +#ifndef illumos + mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL); + mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL); +#endif ASSERT(new_fp->ftp_proc != NULL); @@ -1414,13 +1766,17 @@ fasttrap_provider_free(fasttrap_provider * count of active providers on the associated process structure. */ if (!provider->ftp_retired) { - atomic_add_64(&provider->ftp_proc->ftpc_acount, -1); + atomic_dec_64(&provider->ftp_proc->ftpc_acount); ASSERT(provider->ftp_proc->ftpc_acount < provider->ftp_proc->ftpc_rcount); } fasttrap_proc_release(provider->ftp_proc); +#ifndef illumos + mutex_destroy(&provider->ftp_mtx); + mutex_destroy(&provider->ftp_cmtx); +#endif kmem_free(provider, sizeof (fasttrap_provider_t)); /* @@ -1430,17 +1786,14 @@ fasttrap_provider_free(fasttrap_provider * corresponds to this process's hash chain in the provider hash * table. Don't sweat it if we can't find the process. */ - mutex_enter(&pidlock); - if ((p = prfind(pid)) == NULL) { - mutex_exit(&pidlock); + if ((p = pfind(pid)) == NULL) { return; } - mutex_enter(&p->p_lock); - mutex_exit(&pidlock); - p->p_dtrace_probes--; - mutex_exit(&p->p_lock); +#ifndef illumos + PROC_UNLOCK(p); +#endif } static void @@ -1489,7 +1842,7 @@ fasttrap_provider_retire(pid_t pid, cons * bucket lock therefore protects the integrity of the provider hash * table. */ - atomic_add_64(&fp->ftp_proc->ftpc_acount, -1); + atomic_dec_64(&fp->ftp_proc->ftpc_acount); ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount); fp->ftp_retired = 1; @@ -1528,7 +1881,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t fasttrap_probe_t *pp; fasttrap_tracepoint_t *tp; char *name; - int i, aframes, whack; + int i, aframes = 0, whack; /* * There needs to be at least one desired trace point. 
@@ -1578,17 +1931,17 @@ fasttrap_add_probe(fasttrap_probe_spec_t for (i = 0; i < pdata->ftps_noffs; i++) { char name_str[17]; - (void) snprintf(name_str, sizeof(name_str), "%llx", + (void) sprintf(name_str, "%llx", (unsigned long long)pdata->ftps_offs[i]); if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod, pdata->ftps_func, name_str) != 0) continue; - atomic_add_32(&fasttrap_total, 1); + atomic_inc_32(&fasttrap_total); if (fasttrap_total > fasttrap_max) { - atomic_add_32(&fasttrap_total, -1); + atomic_dec_32(&fasttrap_total); goto no_mem; } @@ -1716,7 +2069,7 @@ fasttrap_meta_provide(void *arg, dtrace_ */ if (strlen(dhpv->dthpv_provname) + 10 >= sizeof (provider->ftp_name)) { - cmn_err(CE_WARN, "failed to instantiate provider %s: " + printf("failed to instantiate provider %s: " "name too long to accomodate pid", dhpv->dthpv_provname); return (NULL); } @@ -1725,7 +2078,7 @@ fasttrap_meta_provide(void *arg, dtrace_ * Don't let folks spoof the true pid provider. */ if (strcmp(dhpv->dthpv_provname, FASTTRAP_PID_NAME) == 0) { - cmn_err(CE_WARN, "failed to instantiate provider %s: " + printf("failed to instantiate provider %s: " "%s is an invalid name", dhpv->dthpv_provname, FASTTRAP_PID_NAME); return (NULL); @@ -1748,7 +2101,7 @@ fasttrap_meta_provide(void *arg, dtrace_ if ((provider = fasttrap_provider_lookup(pid, dhpv->dthpv_provname, &dhpv->dthpv_pattr)) == NULL) { - cmn_err(CE_WARN, "failed to instantiate provider %s for " + printf("failed to instantiate provider %s for " "process %u", dhpv->dthpv_provname, (uint_t)pid); return (NULL); } @@ -1764,6 +2117,18 @@ fasttrap_meta_provide(void *arg, dtrace_ return (provider); } +/* + * We know a few things about our context here: we know that the probe being + * created doesn't already exist (DTrace won't load DOF at the same address + * twice, even if explicitly told to do so) and we know that we are + * single-threaded with respect to the meta provider machinery. Knowing that + * this is a new probe and that there is no way for us to race with another + * operation on this provider allows us an important optimization: we need not + * lookup a probe before adding it. Saving this lookup is important because + * this code is in the fork path for processes with USDT probes, and lookups + * here are potentially very expensive because of long hash conflicts on + * module, function and name (DTrace doesn't hash on provider name). + */ /*ARGSUSED*/ static void fasttrap_meta_create_probe(void *arg, void *parg, @@ -1800,19 +2165,6 @@ fasttrap_meta_create_probe(void *arg, vo return; } - /* - * Grab the creation lock to ensure consistency between calls to - * dtrace_probe_lookup() and dtrace_probe_create() in the face of - * other threads creating probes. 
- */ - mutex_enter(&provider->ftp_cmtx); - - if (dtrace_probe_lookup(provider->ftp_provid, dhpb->dthpb_mod, - dhpb->dthpb_func, dhpb->dthpb_name) != 0) { - mutex_exit(&provider->ftp_cmtx); - return; - } - ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs; ASSERT(ntps > 0); @@ -1820,7 +2172,6 @@ fasttrap_meta_create_probe(void *arg, vo if (fasttrap_total > fasttrap_max) { atomic_add_32(&fasttrap_total, -ntps); - mutex_exit(&provider->ftp_cmtx); return; } @@ -1884,8 +2235,6 @@ fasttrap_meta_create_probe(void *arg, vo */ pp->ftp_id = dtrace_probe_create(provider->ftp_provid, dhpb->dthpb_mod, dhpb->dthpb_func, dhpb->dthpb_name, FASTTRAP_OFFSET_AFRAMES, pp); - - mutex_exit(&provider->ftp_cmtx); } /*ARGSUSED*/ @@ -1909,25 +2258,30 @@ static dtrace_mops_t fasttrap_mops = { /*ARGSUSED*/ static int -fasttrap_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +fasttrap_open(struct cdev *dev __unused, int oflags __unused, + int devtype __unused, struct thread *td __unused) { return (0); } /*ARGSUSED*/ static int -fasttrap_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +fasttrap_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int fflag, + struct thread *td) { +#ifdef notyet + struct kinfo_proc kp; + const cred_t *cr = td->td_ucred; +#endif if (!dtrace_attached()) return (EAGAIN); if (cmd == FASTTRAPIOC_MAKEPROBE) { - fasttrap_probe_spec_t *uprobe = (void *)arg; + fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg; fasttrap_probe_spec_t *probe; uint64_t noffs; size_t size; - int ret; - char *c; + int ret, err; if (copyin(&uprobe->ftps_noffs, &noffs, sizeof (uprobe->ftps_noffs))) @@ -1957,44 +2311,66 @@ fasttrap_ioctl(dev_t dev, int cmd, intpt * Verify that the function and module strings contain no * funny characters. */ - for (c = &probe->ftps_func[0]; *c != '\0'; c++) { - if (*c < 0x20 || 0x7f <= *c) { - ret = EINVAL; - goto err; - } + if (u8_validate(probe->ftps_func, strlen(probe->ftps_func), + NULL, U8_VALIDATE_ENTIRE, &err) < 0) { + ret = EINVAL; + goto err; } - for (c = &probe->ftps_mod[0]; *c != '\0'; c++) { - if (*c < 0x20 || 0x7f <= *c) { - ret = EINVAL; - goto err; - } + if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod), + NULL, U8_VALIDATE_ENTIRE, &err) < 0) { + ret = EINVAL; + goto err; } +#ifdef notyet if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) { proc_t *p; pid_t pid = probe->ftps_pid; +#ifdef illumos mutex_enter(&pidlock); +#endif /* * Report an error if the process doesn't exist * or is actively being birthed. 
*/ - if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) { + sx_slock(&proctree_lock); + p = pfind(pid); + if (p) + fill_kinfo_proc(p, &kp); + sx_sunlock(&proctree_lock); + if (p == NULL || kp.ki_stat == SIDL) { +#ifdef illumos mutex_exit(&pidlock); +#endif return (ESRCH); } +#ifdef illumos mutex_enter(&p->p_lock); mutex_exit(&pidlock); +#else + PROC_LOCK_ASSERT(p, MA_OWNED); +#endif +#ifdef notyet if ((ret = priv_proc_cred_perm(cr, p, NULL, VREAD | VWRITE)) != 0) { +#ifdef illumos mutex_exit(&p->p_lock); +#else + PROC_UNLOCK(p); +#endif return (ret); } - +#endif /* notyet */ +#ifdef illumos mutex_exit(&p->p_lock); +#else + PROC_UNLOCK(p); +#endif } +#endif /* notyet */ ret = fasttrap_add_probe(probe); err: @@ -2006,35 +2382,64 @@ err: fasttrap_instr_query_t instr; fasttrap_tracepoint_t *tp; uint_t index; +#ifdef illumos int ret; +#endif +#ifdef illumos if (copyin((void *)arg, &instr, sizeof (instr)) != 0) return (EFAULT); +#endif +#ifdef notyet if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) { proc_t *p; pid_t pid = instr.ftiq_pid; +#ifdef illumos mutex_enter(&pidlock); +#endif /* * Report an error if the process doesn't exist * or is actively being birthed. */ - if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) { + sx_slock(&proctree_lock); + p = pfind(pid); + if (p) + fill_kinfo_proc(p, &kp); + sx_sunlock(&proctree_lock); + if (p == NULL || kp.ki_stat == SIDL) { +#ifdef illumos mutex_exit(&pidlock); +#endif return (ESRCH); } +#ifdef illumos mutex_enter(&p->p_lock); mutex_exit(&pidlock); +#else + PROC_LOCK_ASSERT(p, MA_OWNED); +#endif +#ifdef notyet if ((ret = priv_proc_cred_perm(cr, p, NULL, VREAD)) != 0) { +#ifdef illumos mutex_exit(&p->p_lock); +#else + PROC_UNLOCK(p); +#endif return (ret); } +#endif /* notyet */ +#ifdef illumos mutex_exit(&p->p_lock); +#else + PROC_UNLOCK(p); +#endif } +#endif /* notyet */ index = FASTTRAP_TPOINTS_INDEX(instr.ftiq_pid, instr.ftiq_pc); @@ -2067,89 +2472,42 @@ err: return (EINVAL); } -static struct cb_ops fasttrap_cb_ops = { - fasttrap_open, /* open */ - nodev, /* close */ - nulldev, /* strategy */ - nulldev, /* print */ - nodev, /* dump */ - nodev, /* read */ - nodev, /* write */ - fasttrap_ioctl, /* ioctl */ - nodev, /* devmap */ - nodev, /* mmap */ - nodev, /* segmap */ - nochpoll, /* poll */ - ddi_prop_op, /* cb_prop_op */ - 0, /* streamtab */ - D_NEW | D_MP /* Driver compatibility flag */ -}; - -/*ARGSUSED*/ -static int -fasttrap_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) -{ - int error; - - switch (infocmd) { - case DDI_INFO_DEVT2DEVINFO: - *result = (void *)fasttrap_devi; - error = DDI_SUCCESS; - break; - case DDI_INFO_DEVT2INSTANCE: - *result = (void *)0; - error = DDI_SUCCESS; - break; - default: - error = DDI_FAILURE; - } - return (error); -} - static int -fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +fasttrap_load(void) { ulong_t nent; + int i, ret; - switch (cmd) { - case DDI_ATTACH: - break; - case DDI_RESUME: - return (DDI_SUCCESS); - default: - return (DDI_FAILURE); - } - - if (ddi_create_minor_node(devi, "fasttrap", S_IFCHR, 0, - DDI_PSEUDO, NULL) == DDI_FAILURE) { - ddi_remove_minor_node(devi, NULL); - return (DDI_FAILURE); - } - - ddi_report_dev(devi); - fasttrap_devi = devi; + /* Create the /dev/dtrace/fasttrap entry. */ + fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "dtrace/fasttrap"); - /* - * Install our hooks into fork(2), exec(2), and exit(2). 
- */ - dtrace_fasttrap_fork_ptr = &fasttrap_fork; - dtrace_fasttrap_exit_ptr = &fasttrap_exec_exit; - dtrace_fasttrap_exec_ptr = &fasttrap_exec_exit; + mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF); + mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT, + NULL); +#ifdef illumos fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT); +#endif fasttrap_total = 0; /* * Conjure up the tracepoints hashtable... */ +#ifdef illumos nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS, "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE); +#else + nent = tpoints_hash_size; +#endif if (nent == 0 || nent > 0x1000000) nent = FASTTRAP_TPOINTS_DEFAULT_SIZE; - if ((nent & (nent - 1)) == 0) + tpoints_hash_size = nent; + + if (ISP2(nent)) fasttrap_tpoints.fth_nent = nent; else fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent); @@ -2157,12 +2515,17 @@ fasttrap_attach(dev_info_t *devi, ddi_at fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1; fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t), KM_SLEEP); +#ifndef illumos + for (i = 0; i < fasttrap_tpoints.fth_nent; i++) + mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx, + "tracepoints bucket mtx", MUTEX_DEFAULT, NULL); +#endif /* * ... and the providers hash table... */ nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE; - if ((nent & (nent - 1)) == 0) + if (ISP2(nent)) fasttrap_provs.fth_nent = nent; else fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent); @@ -2170,12 +2533,35 @@ fasttrap_attach(dev_info_t *devi, ddi_at fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1; fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t), KM_SLEEP); +#ifndef illumos + for (i = 0; i < fasttrap_provs.fth_nent; i++) + mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx, + "providers bucket mtx", MUTEX_DEFAULT, NULL); +#endif + + ret = kproc_create(fasttrap_pid_cleanup_cb, NULL, + &fasttrap_cleanup_proc, 0, 0, "ftcleanup"); + if (ret != 0) { + destroy_dev(fasttrap_cdev); +#ifndef illumos + for (i = 0; i < fasttrap_provs.fth_nent; i++) + mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx); + for (i = 0; i < fasttrap_tpoints.fth_nent; i++) + mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx); +#endif + kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent * + sizeof (fasttrap_bucket_t)); + mtx_destroy(&fasttrap_cleanup_mtx); + mutex_destroy(&fasttrap_count_mtx); + return (ret); + } + /* * ... and the procs hash table. */ nent = FASTTRAP_PROCS_DEFAULT_SIZE; - if ((nent & (nent - 1)) == 0) + if (ISP2(nent)) fasttrap_procs.fth_nent = nent; else fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent); @@ -2183,27 +2569,38 @@ fasttrap_attach(dev_info_t *devi, ddi_at fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1; fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t), KM_SLEEP); +#ifndef illumos + for (i = 0; i < fasttrap_procs.fth_nent; i++) + mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx, + "processes bucket mtx", MUTEX_DEFAULT, NULL); + + rm_init(&fasttrap_tp_lock, "fasttrap tracepoint"); + + /* + * This event handler must run before kdtrace_thread_dtor() since it + * accesses the thread's struct kdtrace_thread. + */ + fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor, + fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST); +#endif + + /* + * Install our hooks into fork(2), exec(2), and exit(2). 
+ */ + dtrace_fasttrap_fork = &fasttrap_fork; + dtrace_fasttrap_exit = &fasttrap_exec_exit; + dtrace_fasttrap_exec = &fasttrap_exec_exit; (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL, &fasttrap_meta_id); - return (DDI_SUCCESS); + return (0); } static int -fasttrap_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) +fasttrap_unload(void) { int i, fail = 0; - timeout_id_t tmp; - - switch (cmd) { - case DDI_DETACH: - break; - case DDI_SUSPEND: - return (DDI_SUCCESS); - default: - return (DDI_FAILURE); - } /* * Unregister the meta-provider to make sure no new fasttrap- @@ -2214,28 +2611,7 @@ fasttrap_detach(dev_info_t *devi, ddi_de */ if (fasttrap_meta_id != DTRACE_METAPROVNONE && dtrace_meta_unregister(fasttrap_meta_id) != 0) - return (DDI_FAILURE); - - /* - * Prevent any new timeouts from running by setting fasttrap_timeout - * to a non-zero value, and wait for the current timeout to complete. - */ - mutex_enter(&fasttrap_cleanup_mtx); - fasttrap_cleanup_work = 0; - - while (fasttrap_timeout != (timeout_id_t)1) { - tmp = fasttrap_timeout; - fasttrap_timeout = (timeout_id_t)1; - - if (tmp != 0) { - mutex_exit(&fasttrap_cleanup_mtx); - (void) untimeout(tmp); - mutex_enter(&fasttrap_cleanup_mtx); - } - } - - fasttrap_cleanup_work = 0; - mutex_exit(&fasttrap_cleanup_mtx); + return (-1); /* * Iterate over all of our providers. If there's still a process @@ -2271,32 +2647,51 @@ fasttrap_detach(dev_info_t *devi, ddi_de } if (fail) { - uint_t work; - /* - * If we're failing to detach, we need to unblock timeouts - * and start a new timeout if any work has accumulated while - * we've been unsuccessfully trying to detach. - */ - mutex_enter(&fasttrap_cleanup_mtx); - fasttrap_timeout = 0; - work = fasttrap_cleanup_work; - mutex_exit(&fasttrap_cleanup_mtx); - - if (work) - fasttrap_pid_cleanup(); - (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL, &fasttrap_meta_id); - return (DDI_FAILURE); + return (-1); } + /* + * Stop new processes from entering these hooks now, before the + * fasttrap_cleanup thread runs. That way all processes will hopefully + * be out of these hooks before we free fasttrap_provs.fth_table + */ + ASSERT(dtrace_fasttrap_fork == &fasttrap_fork); + dtrace_fasttrap_fork = NULL; + + ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit); + dtrace_fasttrap_exec = NULL; + + ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit); + dtrace_fasttrap_exit = NULL; + + mtx_lock(&fasttrap_cleanup_mtx); + fasttrap_cleanup_drain = 1; + /* Wait for the cleanup thread to finish up and signal us. 
*/ + wakeup(&fasttrap_cleanup_cv); + mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld", + 0); + fasttrap_cleanup_proc = NULL; + mtx_destroy(&fasttrap_cleanup_mtx); + #ifdef DEBUG mutex_enter(&fasttrap_count_mtx); ASSERT(fasttrap_pid_count == 0); mutex_exit(&fasttrap_count_mtx); #endif +#ifndef illumos + EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag); + + for (i = 0; i < fasttrap_tpoints.fth_nent; i++) + mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx); + for (i = 0; i < fasttrap_provs.fth_nent; i++) + mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx); + for (i = 0; i < fasttrap_procs.fth_nent; i++) + mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx); +#endif kmem_free(fasttrap_tpoints.fth_table, fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t)); fasttrap_tpoints.fth_nent = 0; @@ -2309,70 +2704,44 @@ fasttrap_detach(dev_info_t *devi, ddi_de fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t)); fasttrap_procs.fth_nent = 0; - /* - * We know there are no tracepoints in any process anywhere in - * the system so there is no process which has its p_dtrace_count - * greater than zero, therefore we know that no thread can actively - * be executing code in fasttrap_fork(). Similarly for p_dtrace_probes - * and fasttrap_exec() and fasttrap_exit(). - */ - ASSERT(dtrace_fasttrap_fork_ptr == &fasttrap_fork); - dtrace_fasttrap_fork_ptr = NULL; - - ASSERT(dtrace_fasttrap_exec_ptr == &fasttrap_exec_exit); - dtrace_fasttrap_exec_ptr = NULL; - - ASSERT(dtrace_fasttrap_exit_ptr == &fasttrap_exec_exit); - dtrace_fasttrap_exit_ptr = NULL; - - ddi_remove_minor_node(devi, NULL); - - return (DDI_SUCCESS); -} - -static struct dev_ops fasttrap_ops = { - DEVO_REV, /* devo_rev */ - 0, /* refcnt */ - fasttrap_info, /* get_dev_info */ - nulldev, /* identify */ - nulldev, /* probe */ - fasttrap_attach, /* attach */ - fasttrap_detach, /* detach */ - nodev, /* reset */ - &fasttrap_cb_ops, /* driver operations */ - NULL, /* bus operations */ - nodev /* dev power */ -}; - -/* - * Module linkage information for the kernel. 
- */ -static struct modldrv modldrv = { - &mod_driverops, /* module type (this is a pseudo driver) */ - "Fasttrap Tracing", /* name of module */ - &fasttrap_ops, /* driver ops */ -}; - -static struct modlinkage modlinkage = { - MODREV_1, - (void *)&modldrv, - NULL -}; +#ifndef illumos + destroy_dev(fasttrap_cdev); + mutex_destroy(&fasttrap_count_mtx); + rm_destroy(&fasttrap_tp_lock); +#endif -int -_init(void) -{ - return (mod_install(&modlinkage)); + return (0); } -int -_info(struct modinfo *modinfop) +/* ARGSUSED */ +static int +fasttrap_modevent(module_t mod __unused, int type, void *data __unused) { - return (mod_info(&modlinkage, modinfop)); -} + int error = 0; -int -_fini(void) -{ - return (mod_remove(&modlinkage)); + switch (type) { + case MOD_LOAD: + break; + + case MOD_UNLOAD: + break; + + case MOD_SHUTDOWN: + break; + + default: + error = EOPNOTSUPP; + break; + } + return (error); } + +SYSINIT(fasttrap_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_load, + NULL); +SYSUNINIT(fasttrap_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, + fasttrap_unload, NULL); + +DEV_MODULE(fasttrap, fasttrap_modevent, NULL); +MODULE_VERSION(fasttrap, 1); +MODULE_DEPEND(fasttrap, dtrace, 1, 1, 1); +MODULE_DEPEND(fasttrap, opensolaris, 1, 1, 1); Index: src/external/cddl/osnet/dist/uts/common/fs/gfs.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/gfs.c diff -N src/external/cddl/osnet/dist/uts/common/fs/gfs.c --- src/external/cddl/osnet/dist/uts/common/fs/gfs.c 14 Dec 2010 01:28:18 -0000 1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,1191 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* Portions Copyright 2007 Shivakumar GN */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* - * Generic pseudo-filesystem routines. - * - * There are significant similarities between the implementation of certain file - * system entry points across different filesystems. While one could attempt to - * "choke up on the bat" and incorporate common functionality into a VOP - * preamble or postamble, such an approach is limited in the benefit it can - * provide. In this file we instead define a toolkit of routines which can be - * called from a filesystem (with in-kernel pseudo-filesystems being the focus - * of the exercise) in a more component-like fashion. 
- * - * There are three basic classes of routines: - * - * 1) Lowlevel support routines - * - * These routines are designed to play a support role for existing - * pseudo-filesystems (such as procfs). They simplify common tasks, - * without forcing the filesystem to hand over management to GFS. The - * routines covered are: - * - * gfs_readdir_init() - * gfs_readdir_emit() - * gfs_readdir_emitn() - * gfs_readdir_pred() - * gfs_readdir_fini() - * gfs_lookup_dot() - * - * 2) Complete GFS management - * - * These routines take a more active role in management of the - * pseudo-filesystem. They handle the relationship between vnode private - * data and VFS data, as well as the relationship between vnodes in the - * directory hierarchy. - * - * In order to use these interfaces, the first member of every private - * v_data must be a gfs_file_t or a gfs_dir_t. This hands over all control - * to GFS. - * - * gfs_file_create() - * gfs_dir_create() - * gfs_root_create() - * - * gfs_file_inactive() - * gfs_dir_inactive() - * gfs_dir_lookup() - * gfs_dir_readdir() - * - * gfs_vop_inactive() - * gfs_vop_lookup() - * gfs_vop_readdir() - * gfs_vop_map() - * - * 3) Single File pseudo-filesystems - * - * This routine creates a rooted file to be overlayed ontop of another - * file in the physical filespace. - * - * Note that the parent is NULL (actually the vfs), but there is nothing - * technically keeping such a file from utilizing the "Complete GFS - * management" set of routines. - * - * gfs_root_create_file() - */ - -/* - * gfs_make_opsvec: take an array of vnode type definitions and create - * their vnodeops_t structures - * - * This routine takes an array of gfs_opsvec_t's. It could - * alternatively take an array of gfs_opsvec_t*'s, which would allow - * vnode types to be completely defined in files external to the caller - * of gfs_make_opsvec(). As it stands, much more sharing takes place -- - * both the caller and the vnode type provider need to access gfsv_ops - * and gfsv_template, and the caller also needs to know gfsv_name. - */ -#ifdef PORT_SOLARIS -int -gfs_make_opsvec(gfs_opsvec_t *vec) -{ - int error, i; - - for (i = 0; ; i++) { - if (vec[i].gfsv_name == NULL) - return (0); - error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template, - vec[i].gfsv_ops); - if (error) - break; - } - - cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'", - vec[i].gfsv_name); - for (i--; i >= 0; i--) { - vn_freevnodeops(*vec[i].gfsv_ops); - *vec[i].gfsv_ops = NULL; - } - return (error); -} -#endif - -/* - * Low level directory routines - * - * These routines provide some simple abstractions for reading directories. - * They are designed to be used by existing pseudo filesystems (namely procfs) - * that already have a complicated management infrastructure. - */ - -/* - * gfs_get_parent_ino: used to obtain a parent inode number and the - * inode number of the given vnode in preparation for calling gfs_readdir_init. 
- */ -int -gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct, - ino64_t *pino, ino64_t *ino) -{ - vnode_t *parent; - gfs_dir_t *dp = dvp->v_data; - int error; - - *ino = dp->gfsd_file.gfs_ino; - parent = dp->gfsd_file.gfs_parent; - - if (parent == NULL) { - *pino = *ino; /* root of filesystem */ - } else if (dvp->v_flag & V_XATTRDIR) { - vattr_t va; - - va.va_mask = AT_NODEID; - error = VOP_GETATTR(parent, &va, 0, cr, ct); - if (error) - return (error); - *pino = va.va_nodeid; - } else { - *pino = ((gfs_file_t *)(parent->v_data))->gfs_ino; - } - - return (0); -} - -/* - * gfs_readdir_init: initiate a generic readdir - * st - a pointer to an uninitialized gfs_readdir_state_t structure - * name_max - the directory's maximum file name length - * ureclen - the exported file-space record length (1 for non-legacy FSs) - * uiop - the uiop passed to readdir - * parent - the parent directory's inode - * self - this directory's inode - * flags - flags from VOP_READDIR - * - * Returns 0 or a non-zero errno. - * - * Typical VOP_READDIR usage of gfs_readdir_*: - * - * if ((error = gfs_readdir_init(...)) != 0) - * return (error); - * eof = 0; - * while ((error = gfs_readdir_pred(..., &voffset)) != 0) { - * if (!consumer_entry_at(voffset)) - * voffset = consumer_next_entry(voffset); - * if (consumer_eof(voffset)) { - * eof = 1 - * break; - * } - * if ((error = gfs_readdir_emit(..., voffset, - * consumer_ino(voffset), consumer_name(voffset))) != 0) - * break; - * } - * return (gfs_readdir_fini(..., error, eofp, eof)); - * - * As you can see, a zero result from gfs_readdir_pred() or - * gfs_readdir_emit() indicates that processing should continue, - * whereas a non-zero result indicates that the loop should terminate. - * Most consumers need do nothing more than let gfs_readdir_fini() - * determine what the cause of failure was and return the appropriate - * value. 
- */ -int -gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen, - uio_t *uiop, ino64_t parent, ino64_t self, int flags) -{ - size_t dirent_size; - - if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 || - (uiop->uio_loffset % ureclen) != 0) - return (EINVAL); - - st->grd_ureclen = ureclen; - st->grd_oresid = uiop->uio_resid; - st->grd_namlen = name_max; - if (flags & V_RDDIR_ENTFLAGS) - dirent_size = EDIRENT_RECLEN(st->grd_namlen); - else - dirent_size = DIRENT64_RECLEN(st->grd_namlen); - st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP); - st->grd_parent = parent; - st->grd_self = self; - st->grd_flags = flags; - - return (0); -} - -/* - * gfs_readdir_emit_int: internal routine to emit directory entry - * - * st - the current readdir state, which must have d_ino/ed_ino - * and d_name/ed_name set - * uiop - caller-supplied uio pointer - * next - the offset of the next entry - */ -static int -gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next) -{ - int reclen, namelen; - dirent64_t *dp; - edirent_t *edp; - - if (st->grd_flags & V_RDDIR_ENTFLAGS) { - edp = st->grd_dirent; - namelen = strlen(edp->ed_name); - reclen = EDIRENT_RECLEN(strlen(edp->ed_name)); - } else { - dp = st->grd_dirent; - namelen = strlen(dp->d_name); - reclen = DIRENT64_RECLEN(strlen(dp->d_name)); - } - - if (reclen > uiop->uio_resid) { - /* - * Error if no entries were returned yet - */ - if (uiop->uio_resid == st->grd_oresid) - return (EINVAL); - return (-1); - } - - if (st->grd_flags & V_RDDIR_ENTFLAGS) { - edp->ed_off = next; - edp->ed_reclen = (ushort_t)reclen; - } else { - dp->d_reclen = (ushort_t)reclen; - dp->d_type = DT_DIR; - dp->d_namlen = namelen; - } - - if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop)) - return (EFAULT); - - uiop->uio_loffset = next; - - return (0); -} - -/* - * gfs_readdir_emit: emit a directory entry - * voff - the virtual offset (obtained from gfs_readdir_pred) - * ino - the entry's inode - * name - the entry's name - * eflags - value for ed_eflags (if processing edirent_t) - * - * Returns a 0 on success, a non-zero errno on failure, or -1 if the - * readdir loop should terminate. A non-zero result (either errno or - * -1) from this function is typically passed directly to - * gfs_readdir_fini(). - */ -int -gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, - ino64_t ino, const char *name, int eflags) -{ - offset_t off = (voff + 2) * st->grd_ureclen; - - if (st->grd_flags & V_RDDIR_ENTFLAGS) { - edirent_t *edp = st->grd_dirent; - - edp->ed_ino = ino; - (void) strncpy(edp->ed_name, name, st->grd_namlen); - edp->ed_eflags = eflags; - } else { - dirent64_t *dp = st->grd_dirent; - - dp->d_ino = ino; - (void) strncpy(dp->d_name, name, st->grd_namlen); - } - - /* - * Inter-entry offsets are invalid, so we assume a record size of - * grd_ureclen and explicitly set the offset appropriately. - */ - return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen)); -} - -/* - * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer - * instead of a string for the entry's name. - */ -int -gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff, - ino64_t ino, unsigned long num) -{ - char buf[40]; - - numtos(num, buf); - return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0)); -} - -/* - * gfs_readdir_pred: readdir loop predicate - * voffp - a pointer in which the next virtual offset should be stored - * - * Returns a 0 on success, a non-zero errno on failure, or -1 if the - * readdir loop should terminate. 
A non-zero result (either errno or - * -1) from this function is typically passed directly to - * gfs_readdir_fini(). - */ -int -gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp) -{ - offset_t off, voff; - int error; - -top: - if (uiop->uio_resid <= 0) - return (-1); - - off = uiop->uio_loffset / st->grd_ureclen; - voff = off - 2; - if (off == 0) { - if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self, - ".", 0)) == 0) - goto top; - } else if (off == 1) { - if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent, - "..", 0)) == 0) - goto top; - } else { - *voffp = voff; - return (0); - } - - return (error); -} - -/* - * gfs_readdir_fini: generic readdir cleanup - * error - if positive, an error to return - * eofp - the eofp passed to readdir - * eof - the eof value - * - * Returns a 0 on success, a non-zero errno on failure. This result - * should be returned from readdir. - */ -int -gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof) -{ - size_t dirent_size; - - if (st->grd_flags & V_RDDIR_ENTFLAGS) - dirent_size = EDIRENT_RECLEN(st->grd_namlen); - else - dirent_size = DIRENT64_RECLEN(st->grd_namlen); - kmem_free(st->grd_dirent, dirent_size); - if (error > 0) - return (error); - if (eofp) - *eofp = eof; - return (0); -} - -/* - * gfs_lookup_dot - * - * Performs a basic check for "." and ".." directory entries. - */ -int -gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm) -{ - if (*nm == '\0' || strcmp(nm, ".") == 0) { - VN_HOLD(dvp); - *vpp = dvp; - return (0); - } else if (strcmp(nm, "..") == 0) { - if (pvp == NULL) { - ASSERT(dvp->v_flag & VROOT); - VN_HOLD(dvp); - *vpp = dvp; - } else { - VN_HOLD(pvp); - *vpp = pvp; - } - return (0); - } - - return (-1); -} - -/* - * gfs_file_create(): create a new GFS file - * - * size - size of private data structure (v_data) - * pvp - parent vnode (GFS directory) - * ops - vnode operations vector - * - * In order to use this interface, the parent vnode must have been created by - * gfs_dir_create(), and the private data stored in v_data must have a - * 'gfs_file_t' as its first field. - * - * Given these constraints, this routine will automatically: - * - * - Allocate v_data for the vnode - * - Initialize necessary fields in the vnode - * - Hold the parent - */ -vnode_t * -gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops) -{ - gfs_file_t *fp; - vnode_t *vp; - int error; - - /* - * Allocate vnode and internal data structure - */ - fp = kmem_zalloc(size, KM_SLEEP); - /* XXX FreeBSD adds vfs_t * as parameter to gfs_file_create and - gfs_dir_create */ - error = getnewvnode(VT_ZFS, pvp->v_vfsp, ops, &vp); - ASSERT(error == 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - - /* - * Set up various pointers - */ - fp->gfs_vnode = vp; - fp->gfs_parent = pvp; - vp->v_data = fp; - fp->gfs_size = size; - fp->gfs_type = GFS_FILE; - - /* - * Initialize vnode and hold parent. 
- */ - vn_setops(vp, ops); - if (pvp) { - VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0); - VN_HOLD(pvp); - } - - return (vp); -} - -/* - * gfs_dir_create: creates a new directory in the parent - * - * size - size of private data structure (v_data) - * pvp - parent vnode (GFS directory) - * ops - vnode operations vector - * entries - NULL-terminated list of static entries (if any) - * maxlen - maximum length of a directory entry - * readdir_cb - readdir callback (see gfs_dir_readdir) - * inode_cb - inode callback (see gfs_dir_readdir) - * lookup_cb - lookup callback (see gfs_dir_lookup) - * - * In order to use this function, the first member of the private vnode - * structure (v_data) must be a gfs_dir_t. For each directory, there are - * static entries, defined when the structure is initialized, and dynamic - * entries, retrieved through callbacks. - * - * If a directory has static entries, then it must supply a inode callback, - * which will compute the inode number based on the parent and the index. - * For a directory with dynamic entries, the caller must supply a readdir - * callback and a lookup callback. If a static lookup fails, we fall back to - * the supplied lookup callback, if any. - * - * This function also performs the same initialization as gfs_file_create(). - */ -vnode_t * -gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops, - gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, - gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) -{ - vnode_t *vp; - gfs_dir_t *dp; - gfs_dirent_t *de; - - vp = gfs_file_create(struct_size, pvp, ops); - vp->v_type = VDIR; - - dp = vp->v_data; - dp->gfsd_file.gfs_type = GFS_DIR; - dp->gfsd_maxlen = maxlen; - - if (entries != NULL) { - for (de = entries; de->gfse_name != NULL; de++) - dp->gfsd_nstatic++; - - dp->gfsd_static = kmem_alloc( - dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP); - bcopy(entries, dp->gfsd_static, - dp->gfsd_nstatic * sizeof (gfs_dirent_t)); - } - - dp->gfsd_readdir = readdir_cb; - dp->gfsd_lookup = lookup_cb; - dp->gfsd_inode = inode_cb; - - mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL); - - return (vp); -} - -/* - * gfs_root_create(): create a root vnode for a GFS filesystem - * - * Similar to gfs_dir_create(), this creates a root vnode for a filesystem. The - * only difference is that it takes a vfs_t instead of a vnode_t as its parent. - */ -vnode_t * -gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino, - gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen, - gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb) -{ - vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb, - maxlen, readdir_cb, lookup_cb); - - /* Manually set the inode */ - ((gfs_file_t *)vp->v_data)->gfs_ino = ino; - - VFS_HOLD(vfsp); - VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0); - vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; - - return (vp); -} - -/* - * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem - * - * Similar to gfs_root_create(), this creates a root vnode for a file to - * be the pseudo-filesystem. - */ -vnode_t * -gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino) -{ - vnode_t *vp = gfs_file_create(size, NULL, ops); - - ((gfs_file_t *)vp->v_data)->gfs_ino = ino; - - VFS_HOLD(vfsp); - VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0); - vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; - - return (vp); -} - -/* - * gfs_file_inactive() - * - * Called from the VOP_INACTIVE() routine. 
If necessary, this routine will - * remove the given vnode from the parent directory and clean up any references - * in the VFS layer. - * - * If the vnode was not removed (due to a race with vget), then NULL is - * returned. Otherwise, a pointer to the private data is returned. - */ -void * -gfs_file_inactive(vnode_t *vp) -{ - int i; - gfs_dirent_t *ge = NULL; - gfs_file_t *fp = vp->v_data; - gfs_dir_t *dp = NULL; - void *data; - - if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR)) - goto found; - - dp = fp->gfs_parent->v_data; - - /* - * First, see if this vnode is cached in the parent. - */ - gfs_dir_lock(dp); - - /* - * Find it in the set of static entries. - */ - for (i = 0; i < dp->gfsd_nstatic; i++) { - ge = &dp->gfsd_static[i]; - - if (ge->gfse_vnode == vp) - goto found; - } - - /* - * If 'ge' is NULL, then it is a dynamic entry. - */ - ge = NULL; - -found: - if (vp->v_flag & V_XATTRDIR) { - mutex_enter(&fp->gfs_parent->v_lock); - } - mutex_enter(&vp->v_lock); -#ifdef PORT_SOLARIS - if (vp->v_count == 1) { - /* - * Really remove this vnode - */ - data = vp->v_data; - if (ge != NULL) { - /* - * If this was a statically cached entry, simply set the - * cached vnode to NULL. - */ - ge->gfse_vnode = NULL; - } - if (vp->v_flag & V_XATTRDIR) { - fp->gfs_parent->v_xattrdir = NULL; - mutex_exit(&fp->gfs_parent->v_lock); - } - mutex_exit(&vp->v_lock); - - /* - * Free vnode and release parent - */ - if (fp->gfs_parent) { - if (dp) { - gfs_dir_unlock(dp); - } - VN_RELE(fp->gfs_parent); - } else { - ASSERT(vp->v_vfsp != NULL); - VFS_RELE(vp->v_vfsp); - } - vn_free(vp); - } else { - vp->v_count--; - data = NULL; - mutex_exit(&vp->v_lock); - if (vp->v_flag & V_XATTRDIR) { - mutex_exit(&fp->gfs_parent->v_lock); - } - if (dp) - gfs_dir_unlock(dp); - } -#endif - - return (data); -} - -/* - * gfs_dir_inactive() - * - * Same as above, but for directories. - */ -void * -gfs_dir_inactive(vnode_t *vp) -{ - gfs_dir_t *dp; - - ASSERT(vp->v_type == VDIR); - - if ((dp = gfs_file_inactive(vp)) != NULL) { - mutex_destroy(&dp->gfsd_lock); - if (dp->gfsd_nstatic) - kmem_free(dp->gfsd_static, - dp->gfsd_nstatic * sizeof (gfs_dirent_t)); - } - - return (dp); -} - -/* - * gfs_dir_lookup_dynamic() - * - * This routine looks up the provided name amongst the dynamic entries - * in the gfs directory and returns the corresponding vnode, if found. - * - * The gfs directory is expected to be locked by the caller prior to - * calling this function. The directory will be unlocked during the - * execution of this function, but will be locked upon return from the - * function. This function returns 0 on success, non-zero on error. - * - * The dynamic lookups are performed by invoking the lookup - * callback, which is passed to this function as the first argument. - * The arguments to the callback are: - * - * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr, - * int flags, int *deflgs, pathname_t *rpnp); - * - * pvp - parent vnode - * nm - name of entry - * vpp - pointer to resulting vnode - * cr - pointer to cred - * flags - flags value from lookup request - * ignored here; currently only used to request - * insensitive lookups - * direntflgs - output parameter, directory entry flags - * ignored here; currently only used to indicate a lookup - * has more than one possible match when case is not considered - * realpnp - output parameter, real pathname - * ignored here; when lookup was performed case-insensitively, - * this field contains the "real" name of the file. 
- * - * Returns 0 on success, non-zero on error. - */ -static int -gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp, - const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags, - int *direntflags, pathname_t *realpnp) -{ - gfs_file_t *fp; - ino64_t ino; - int ret; - - ASSERT(GFS_DIR_LOCKED(dp)); - - /* - * Drop the directory lock, as the lookup routine - * will need to allocate memory, or otherwise deadlock on this - * directory. - */ - gfs_dir_unlock(dp); - ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp); - gfs_dir_lock(dp); - - /* - * The callback for extended attributes returns a vnode - * with v_data from an underlying fs. - */ - if (ret == 0 && !IS_XATTRDIR(dvp)) { - fp = (gfs_file_t *)((*vpp)->v_data); - fp->gfs_index = -1; - fp->gfs_ino = ino; - } - - return (ret); -} - -/* - * gfs_dir_lookup_static() - * - * This routine looks up the provided name amongst the static entries - * in the gfs directory and returns the corresponding vnode, if found. - * The first argument to the function is a pointer to the comparison - * function this function should use to decide if names are a match. - * - * If a match is found, and GFS_CACHE_VNODE is set and the vnode - * exists, we simply return the existing vnode. Otherwise, we call - * the static entry's callback routine, caching the result if - * necessary. If the idx pointer argument is non-NULL, we use it to - * return the index of the matching static entry. - * - * The gfs directory is expected to be locked by the caller prior to calling - * this function. The directory may be unlocked during the execution of - * this function, but will be locked upon return from the function. - * - * This function returns 0 if a match is found, ENOENT if not. - */ -static int -gfs_dir_lookup_static(int (*compare)(const char *, const char *), - gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx, - vnode_t **vpp, pathname_t *rpnp) -{ - gfs_dirent_t *ge; - vnode_t *vp = NULL; - int i; - - ASSERT(GFS_DIR_LOCKED(dp)); - - /* - * Search static entries. - */ - for (i = 0; i < dp->gfsd_nstatic; i++) { - ge = &dp->gfsd_static[i]; - - if (compare(ge->gfse_name, nm) == 0) { - if (rpnp) - (void) strlcpy(rpnp->pn_buf, ge->gfse_name, - rpnp->pn_bufsize); - - if (ge->gfse_vnode) { - ASSERT(ge->gfse_flags & GFS_CACHE_VNODE); - vp = ge->gfse_vnode; - VN_HOLD(vp); - break; - } - - /* - * We drop the directory lock, as the constructor will - * need to do KM_SLEEP allocations. If we return from - * the constructor only to find that a parallel - * operation has completed, and GFS_CACHE_VNODE is set - * for this entry, we discard the result in favor of - * the cached vnode. - */ - gfs_dir_unlock(dp); - vp = ge->gfse_ctor(dvp); - gfs_dir_lock(dp); - - ((gfs_file_t *)vp->v_data)->gfs_index = i; - - /* Set the inode according to the callback. */ - ((gfs_file_t *)vp->v_data)->gfs_ino = - dp->gfsd_inode(dvp, i); - - if (ge->gfse_flags & GFS_CACHE_VNODE) { - if (ge->gfse_vnode == NULL) { - ge->gfse_vnode = vp; - } else { - /* - * A parallel constructor beat us to it; - * return existing vnode. We have to be - * careful because we can't release the - * current vnode while holding the - * directory lock; its inactive routine - * will try to lock this directory. 
- */ - vnode_t *oldvp = vp; - vp = ge->gfse_vnode; - VN_HOLD(vp); - - gfs_dir_unlock(dp); - VN_RELE(oldvp); - gfs_dir_lock(dp); - } - } - break; - } - } - - if (vp == NULL) - return (ENOENT); - else if (idx) - *idx = i; - *vpp = vp; - return (0); -} - -/* - * gfs_dir_lookup() - * - * Looks up the given name in the directory and returns the corresponding - * vnode, if found. - * - * First, we search statically defined entries, if any, with a call to - * gfs_dir_lookup_static(). If no static entry is found, and we have - * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic(). - * - * This function returns 0 on success, non-zero on error. - */ -int -gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr, - int flags, int *direntflags, pathname_t *realpnp) -{ - gfs_dir_t *dp = dvp->v_data; - boolean_t casecheck; - vnode_t *dynvp = NULL; - vnode_t *vp = NULL; - int (*compare)(const char *, const char *); - int error, idx; - - ASSERT(dvp->v_type == VDIR); - - if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0) - return (0); - - casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL; - if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) || - (flags & FIGNORECASE)) - compare = strcasecmp; - else - compare = strcmp; - - gfs_dir_lock(dp); - - error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp); - - if (vp && casecheck) { - gfs_dirent_t *ge; - int i; - - for (i = idx + 1; i < dp->gfsd_nstatic; i++) { - ge = &dp->gfsd_static[i]; - - if (strcasecmp(ge->gfse_name, nm) == 0) { - *direntflags |= ED_CASE_CONFLICT; - goto out; - } - } - } - - if ((error || casecheck) && dp->gfsd_lookup) - error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp, - &dynvp, cr, flags, direntflags, vp ? NULL : realpnp); - - if (vp && dynvp) { - /* static and dynamic entries are case-insensitive conflict */ - ASSERT(casecheck); - *direntflags |= ED_CASE_CONFLICT; - VN_RELE(dynvp); - } else if (vp == NULL) { - vp = dynvp; - } else if (error == ENOENT) { - error = 0; - } else if (error) { - VN_RELE(vp); - vp = NULL; - } - -out: - gfs_dir_unlock(dp); - - *vpp = vp; - return (error); -} - -/* - * gfs_dir_readdir: does a readdir() on the given directory - * - * dvp - directory vnode - * uiop - uio structure - * eofp - eof pointer - * data - arbitrary data passed to readdir callback - * - * This routine does all the readdir() dirty work. Even so, the caller must - * supply two callbacks in order to get full compatibility. - * - * If the directory contains static entries, an inode callback must be - * specified. This avoids having to create every vnode and call VOP_GETATTR() - * when reading the directory. This function has the following arguments: - * - * ino_t gfs_inode_cb(vnode_t *vp, int index); - * - * vp - vnode for the directory - * index - index in original gfs_dirent_t array - * - * Returns the inode number for the given entry. - * - * For directories with dynamic entries, a readdir callback must be provided. - * This is significantly more complex, thanks to the particulars of - * VOP_READDIR(). - * - * int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp, - * offset_t *off, offset_t *nextoff, void *data, int flags) - * - * vp - directory vnode - * dp - directory entry, sized according to maxlen given to - * gfs_dir_create(). callback must fill in d_name and - * d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags - * (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS - * is set in 'flags'. 
- * eofp - callback must set to 1 when EOF has been reached - * off - on entry, the last offset read from the directory. Callback - * must set to the offset of the current entry, typically left - * untouched. - * nextoff - callback must set to offset of next entry. Typically - * (off + 1) - * data - caller-supplied data - * flags - VOP_READDIR flags - * - * Return 0 on success, or error on failure. - */ -int -gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr, - caller_context_t *ct, int flags) -{ - gfs_readdir_state_t gstate; - int error, eof = 0; - ino64_t ino, pino; - offset_t off, next; - gfs_dir_t *dp = dvp->v_data; - - error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino); - if (error) - return (error); - - if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop, - pino, ino, flags)) != 0) - return (error); - - while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 && - !eof) { - - if (off >= 0 && off < dp->gfsd_nstatic) { - ino = dp->gfsd_inode(dvp, off); - - if ((error = gfs_readdir_emit(&gstate, uiop, - off, ino, dp->gfsd_static[off].gfse_name, 0)) - != 0) - break; - - } else if (dp->gfsd_readdir) { - off -= dp->gfsd_nstatic; - - if ((error = dp->gfsd_readdir(dvp, - gstate.grd_dirent, &eof, &off, &next, - data, flags)) != 0 || eof) - break; - - off += dp->gfsd_nstatic + 2; - next += dp->gfsd_nstatic + 2; - - if ((error = gfs_readdir_emit_int(&gstate, uiop, - next)) != 0) - break; - } else { - /* - * Offset is beyond the end of the static entries, and - * we have no dynamic entries. Set EOF. - */ - eof = 1; - } - } - - return (gfs_readdir_fini(&gstate, error, eofp, eof)); -} - - -/* - * gfs_vop_lookup: VOP_LOOKUP() entry point - * - * For use directly in vnode ops table. Given a GFS directory, calls - * gfs_dir_lookup() as necessary. - */ -/* ARGSUSED */ -int -gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp)); -} - -/* - * gfs_vop_readdir: VOP_READDIR() entry point - * - * For use directly in vnode ops table. Given a GFS directory, calls - * gfs_dir_readdir() as necessary. - */ -/* ARGSUSED */ -int -gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, - caller_context_t *ct, int flags) -{ - return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags)); -} - - -/* - * gfs_vop_map: VOP_MAP() entry point - * - * Convenient routine for handling pseudo-files that wish to allow mmap() calls. - * This function only works for readonly files, and uses the read function for - * the vnode to fill in the data. The mapped data is immediately faulted in and - * filled with the necessary data during this call; there are no getpage() or - * putpage() routines. - */ -/* ARGSUSED */ -#ifdef PORT_SOLARIS -int -gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred, - caller_context_t *ct) -{ - int rv; - ssize_t resid = len; - - /* - * Check for bad parameters - */ -#ifdef _ILP32 - if (len > MAXOFF_T) - return (ENOMEM); -#endif - if (vp->v_flag & VNOMAP) - return (ENOTSUP); - if (off > MAXOFF_T) - return (EFBIG); - if ((long)off < 0 || (long)(off + len) < 0) - return (EINVAL); - if (vp->v_type != VREG) - return (ENODEV); - if ((prot & (PROT_EXEC | PROT_WRITE)) != 0) - return (EACCES); - - /* - * Find appropriate address if needed, otherwise clear address range. 
- */ - as_rangelock(as); - rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (rv != 0) { - as_rangeunlock(as); - return (rv); - } - - /* - * Create mapping - */ - rv = as_map(as, *addrp, len, segvn_create, zfod_argsp); - as_rangeunlock(as); - if (rv != 0) - return (rv); - - /* - * Fill with data from read() - */ - rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE, - 0, (rlim64_t)0, cred, &resid); - - if (rv == 0 && resid != 0) - rv = ENXIO; - - if (rv != 0) { - as_rangelock(as); - (void) as_unmap(as, *addrp, len); - as_rangeunlock(as); - } - - return (rv); -} -#endif -/* - * gfs_vop_inactive: VOP_INACTIVE() entry point - * - * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or - * gfs_dir_inactive() as necessary, and kmem_free()s associated private data. - */ -/* ARGSUSED */ -void -gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - gfs_file_t *fp = vp->v_data; - void *data; - - if (fp->gfs_type == GFS_DIR) - data = gfs_dir_inactive(vp); - else - data = gfs_file_inactive(vp); - - if (data != NULL) - kmem_free(data, fp->gfs_size); -} Index: src/external/cddl/osnet/dist/uts/common/fs/vnode.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/vnode.c diff -N src/external/cddl/osnet/dist/uts/common/fs/vnode.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/vnode.c 11 May 2017 20:07:48 -0000 @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#include +#include +#include +#include +#include + +/* Extensible attribute (xva) routines. */ + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = AT_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. 
+ */ +xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} + +#ifdef __FreeBSD__ +static void +vn_rele_inactive(vnode_t *vp) +{ + vrele(vp); +} + +/* + * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it + * asynchronously using a taskq. This can avoid deadlocks caused by re-entering + * the file system as a result of releasing the vnode. Note, file systems + * already have to handle the race where the vnode is incremented before the + * inactive routine is called and does its locking. + * + * Warning: Excessive use of this routine can lead to performance problems. + * This is because taskqs throttle back allocation if too many are created. + */ +void +vn_rele_async(vnode_t *vp, taskq_t *taskq) +{ + VERIFY(vp->v_count > 0); + VI_LOCK(vp); + if (vp->v_count == 1 && !(vp->v_iflag & VI_DOINGINACT)) { + VI_UNLOCK(vp); + VERIFY(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)vn_rele_inactive, vp, TQ_SLEEP) != 0); + return; + } + refcount_release(&vp->v_usecount); + vdropl(vp); +} +#endif /* __FreeBSD__ */ Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4 16 Feb 2013 16:40:57 -0000 @@ -0,0 +1,30 @@ +LZ4 - Fast LZ compression algorithm +Copyright (C) 2011-2013, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
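
The xva_init()/xva_getxoptattr() pair added in vnode.c above implements the usual Solaris extensible-attribute handshake: the caller requests optional attributes, the filesystem fills in and flags the ones it returned. A minimal caller-side sketch, assuming the standard XVA_SET_REQ()/XVA_ISSET_RTN() macros and the XAT_READONLY attribute from the same vattr compatibility header (illustrative fragment only):

	xvattr_t xva;
	xoptattr_t *xoap;

	xva_init(&xva);
	XVA_SET_REQ(&xva, XAT_READONLY);	/* ask for one optional attribute */

	/* ... hand &xva.xva_vattr to the filesystem's getattr path ... */

	xoap = xva_getxoptattr(&xva);
	if (xoap != NULL && XVA_ISSET_RTN(&xva, XAT_READONLY)) {
		/* the filesystem filled in xoap->xoa_readonly */
	}

xva_init() setting AT_XVATTR in the embedded va_mask is what lets a filesystem tell an xvattr_t apart from a plain vattr_t, which is exactly the check xva_getxoptattr() performs.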
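
vn_rele_async(), also added in vnode.c above, exists for teardown paths that may already hold filesystem locks: when the last hold on a vnode is being dropped, the VOP_INACTIVE() work is dispatched to a taskq instead of running in the caller. A hedged sketch of the call pattern; the taskq and the wrapper function are assumptions for illustration (ZFS itself uses a per-pool taskq for this purpose):

	static taskq_t *release_tq;	/* assumed: created once with taskq_create() */

	static void
	drop_last_hold(vnode_t *vp)
	{
		/*
		 * If this turns out to be the final reference, the vrele()
		 * (and thus VOP_INACTIVE()) runs in a taskq thread, so this
		 * caller cannot deadlock by re-entering the filesystem here.
		 */
		vn_rele_async(vp, release_tq);
	}

As the comment in the patch warns, this should be used sparingly, since heavy taskq dispatch traffic throttles allocation.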
+ +You can contact the author at : +- LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html +- LZ4 source repository : http://code.google.com/p/lz4/ Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/THIRDPARTYLICENSE.lz4.descrip 16 Feb 2013 16:40:56 -0000 @@ -0,0 +1 @@ +LZ4 COMPRESSION FUNCTIONALITY IN ZFS Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c,v retrieving revision 1.11 diff -u -p -r1.11 arc.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c 27 Jan 2012 19:48:38 -0000 1.11 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c 16 May 2017 00:54:12 -0000 @@ -19,8 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* @@ -56,11 +59,11 @@ * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All - * elements of the cache are therefor exactly the same size. So + * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (rangeing from 512 bytes to - * 128K bytes). We therefor choose a set of blocks to evict to make + * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * @@ -75,13 +78,13 @@ * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for - * adjusting the cache use method 2. We therefor provide two + * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * - * Buffers do not have their own mutexs, rather they rely on the - * hash table mutexs for the bulk of their protection (i.e. most - * fields in the arc_buf_hdr_t are protected by these mutexs). + * Buffers do not have their own mutexes, rather they rely on the + * hash table mutexes for the bulk of their protection (i.e. most + * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns @@ -102,13 +105,13 @@ * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). 
Additionally, * the users of callbacks must ensure that their private data is - * protected from simultaneous callbacks from arc_buf_evict() + * protected from simultaneous callbacks from arc_clear_callback() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated * with atomic operations. * - * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction @@ -117,29 +120,167 @@ * - ARC header release, as it removes from L2ARC buflists */ +/* + * ARC operation: + * + * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. + * This structure can point either to a block that is still in the cache or to + * one that is only accessible in an L2 ARC device, or it can provide + * information about a block that was recently evicted. If a block is + * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough + * information to retrieve it from the L2ARC device. This information is + * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block + * that is in this state cannot access the data directly. + * + * Blocks that are actively being referenced or have not been evicted + * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within + * the arc_buf_hdr_t that will point to the data block in memory. A block can + * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC + * caches data in two ways -- in a list of arc buffers (arc_buf_t) and + * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). + * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer, and always contains uncompressed data. The ARC will provide + * references to this data and will keep it cached until it is no longer in + * use. Typically, the arc will try to cache only the L1ARC's physical data + * block and will aggressively evict any arc_buf_t that is no longer referenced. + * The amount of memory consumed by the arc_buf_t's can be seen via the + * "overhead_size" kstat. + * + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * (potentially) | | | | + * compressed | | | | + * data +------+ | v + * +->+------+ +------+ + * uncompressed | | | | + * data | | | | + * +------+ +------+ + * + * The L1ARC's data pointer, however, may or may not be uncompressed. The + * ARC has the ability to store the physical data (b_pdata) associated with + * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk + * physical block, it will match its on-disk compression characteristics. + * If the block on-disk is compressed, then the physical data block + * in the cache will also be compressed and vice-versa. This behavior + * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the + * compressed ARC functionality is disabled, the b_pdata will point to an + * uncompressed version of the on-disk data. + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. 
If the hdr is cached and already has an arc_buf_t, + * then an additional arc_buf_t is allocated and the uncompressed data is + * bcopied from the existing arc_buf_t. If the hdr is cached but does not + * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses + * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's + * b_pdata is not compressed, then the block is shared with the newly + * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t + * in the arc buffer chain. Sharing the block reduces the memory overhead + * required when the hdr is caching uncompressed blocks or the compressed + * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. + * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t: + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pdata +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the arc requires that the ARC first discard the b_pdata + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline + * performs the write, it may compress the data before writing it to disk. + * The ARC will be called with the transformed data and will bcopy the + * transformed on-disk block into a newly allocated b_pdata. + * + * When the L2ARC is in use, it will also take advantage of the b_pdata. The + * L2ARC will always write the contents of b_pdata to the L2ARC. This means + * that when compressed arc is enabled that the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * arc is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. + */ + #include #include +#include +#include +#include #include #include #include #include #include +#include +#include #ifdef _KERNEL -#include -#include -#include #include +#include #endif #include #include +#include #include +#include + +#include + +#ifdef illumos +#ifndef _KERNEL +/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ +boolean_t arc_watch = B_FALSE; +int arc_procfd; +#endif +#endif /* illumos */ #ifdef __NetBSD__ #include #ifndef btop #define btop(x) ((x) / PAGE_SIZE) #endif -#define needfree (uvmexp.free < uvmexp.freetarg ? uvmexp.freetarg : 0) +//#define needfree (uvmexp.free < uvmexp.freetarg ? 
uvmexp.freetarg : 0) #define buf_init arc_buf_init #define freemem uvmexp.free #define minfree uvmexp.freemin @@ -160,30 +301,58 @@ static struct callback_entry arc_kva_rec #endif /* __NetBSD__ */ -static kmutex_t arc_reclaim_thr_lock; -static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ -static uint8_t arc_thread_exit; - -extern int zfs_write_limit_shift; -extern uint64_t zfs_write_limit_max; -extern kmutex_t zfs_write_limit_lock; - -#define ARC_REDUCE_DNLC_PERCENT 3 -uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; - -typedef enum arc_reclaim_strategy { - ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ - ARC_RECLAIM_CONS /* Conservative reclaim strategy */ -} arc_reclaim_strategy_t; +static kmutex_t arc_reclaim_lock; +static kcondvar_t arc_reclaim_thread_cv; +static boolean_t arc_reclaim_thread_exit; +static kcondvar_t arc_reclaim_waiters_cv; + +#ifdef __FreeBSD__ +static kmutex_t arc_dnlc_evicts_lock; +static kcondvar_t arc_dnlc_evicts_cv; +static boolean_t arc_dnlc_evicts_thread_exit; + +uint_t arc_reduce_dnlc_percent = 3; +#endif + +/* + * The number of headers to evict in arc_evict_state_impl() before + * dropping the sublist lock and evicting from another sublist. A lower + * value means we're more likely to evict the "correct" header (i.e. the + * oldest header in the arc state), but comes with higher overhead + * (i.e. more invocations of arc_evict_state_impl()). + */ +int zfs_arc_evict_batch_limit = 10; + +/* + * The number of sublists used for each of the arc state lists. If this + * is not set to a suitable value by the user, it will be configured to + * the number of CPUs on the system in arc_init(). + */ +int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; +/* shift of arc_c for calculating overflow limit in arc_get_data_buf */ +int zfs_arc_overflow_shift = 8; + /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ -static int arc_shrink_shift = 5; +static int arc_shrink_shift = 7; + +/* + * log2(fraction of ARC which must be free to allow growing). + * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, + * when reading a new block into the ARC, we will evict an equal-sized block + * from the ARC. + * + * This must be less than arc_shrink_shift, so that when we shrink the ARC, + * we will still not allow it to grow. + */ +int arc_no_grow_shift = 5; + /* * minimum lifespan of a prefetch block in clock ticks @@ -191,7 +360,13 @@ static int arc_shrink_shift = 5; */ static int arc_min_prefetch_lifespan; +/* + * If this percent of memory is free, don't throttle. + */ +int arc_lotsfree_percent = 10; + static int arc_dead; +extern boolean_t zfs_prefetch_disable; /* * The arc has filled available memory and has now warmed up. @@ -204,9 +379,89 @@ static boolean_t arc_warm; uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; +uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; +uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ +u_int zfs_arc_free_target = 0; + +/* Absolute min for arc min / max is 16MB. 
*/ +static uint64_t arc_abs_min = 16 << 20; + +boolean_t zfs_compressed_arc_enabled = B_TRUE; + +#if defined(__FreeBSD__) && defined(_KERNEL) +static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); +static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); +static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); +static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); + +static void +arc_free_target_init(void *unused __unused) +{ + + zfs_arc_free_target = vm_pageout_wakeup_thresh; +} +SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, + arc_free_target_init, NULL); + +TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); +TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); +TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); +SYSCTL_DECL(_vfs_zfs); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN, + 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN, + 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, + &zfs_arc_average_blocksize, 0, + "ARC average blocksize"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, + &arc_shrink_shift, 0, + "log2(fraction of arc to reclaim)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, + &zfs_compressed_arc_enabled, 0, "Enable compressed ARC"); + +/* + * We don't have a tunable for arc_free_target due to the dependency on + * pagedaemon initialisation. + */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), + sysctl_vfs_zfs_arc_free_target, "IU", + "Desired number of free pages below which ARC triggers reclaim"); + +static int +sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) +{ + u_int val; + int err; + + val = zfs_arc_free_target; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < minfree) + return (EINVAL); + if (val > vm_cnt.v_page_count) + return (EINVAL); + + zfs_arc_free_target = val; + + return (0); +} + +/* + * Must be declared here, before the definition of corresponding kstat + * macro which uses the same names will confuse the compiler. + */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_vfs_zfs_arc_meta_limit, "QU", + "ARC metadata limit"); +#endif /* * Note that buffers can be in one of 6 states: @@ -241,10 +496,19 @@ int zfs_arc_p_min_shift = 0; */ typedef struct arc_state { - list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ - uint64_t arcs_size; /* total amount of data in this state */ - kmutex_t arcs_mtx; + /* + * list of evictable buffers + */ + multilist_t arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 
+ */ + refcount_t arcs_size; } arc_state_t; /* The 6 states: */ @@ -270,13 +534,30 @@ typedef struct arc_stats { kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; - kstat_named_t arcstat_recycle_miss; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped because they have I/O in progress, are + * indrect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach it's target amount. + */ + kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -287,9 +568,157 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pdata. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; + /* + * Number of bytes consumed by internal ARC structures necessary + * for tracking purposes; these structures are not actually + * backed by ARC buffers. This includes arc_buf_hdr_t structures + * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only + * caches), and arc_buf_t structures (allocated via arc_buf_t + * cache). + */ kstat_named_t arcstat_hdr_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_DATA. This is generally consumed by buffers backing + * on disk user data (e.g. plain file contents). + */ kstat_named_t arcstat_data_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_METADATA. This is generally consumed by buffers + * backing on disk data that is used for internal ZFS + * structures (e.g. ZAP, dnode, indirect blocks, etc). + */ + kstat_named_t arcstat_metadata_size; + /* + * Number of bytes consumed by various buffers and structures + * not actually backed with ARC buffers. This includes bonus + * buffers (allocated directly via zio_buf_* functions), + * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t + * cache), and dnode_t structures (allocated via dnode_t cache). 
+ */ kstat_named_t arcstat_other_size; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_anon state. This includes *all* buffers in the arc_anon + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + */ + kstat_named_t arcstat_anon_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_anon_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_anon_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mru state. This includes *all* buffers in the arc_mru + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + */ + kstat_named_t arcstat_mru_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_mru_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + */ + kstat_named_t arcstat_mru_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mru_ghost state. The key thing to note + * here, is the fact that this size doesn't actually indicate + * RAM consumption. The ghost lists only consist of headers and + * don't actually have ARC buffers linked off of these headers. + * Thus, *if* the headers had associated ARC buffers, these + * buffers *would have* consumed this number of bytes. + */ + kstat_named_t arcstat_mru_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. + */ + kstat_named_t arcstat_mru_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + */ + kstat_named_t arcstat_mru_ghost_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mfu state. This includes *all* buffers in the arc_mfu + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + */ + kstat_named_t arcstat_mfu_size; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu + * state. + */ + kstat_named_t arcstat_mfu_evictable_data; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_METADATA, and reside in the + * arc_mfu state. + */ + kstat_named_t arcstat_mfu_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mfu_ghost state. 
See the comment above + * arcstat_mru_ghost_size for more details. + */ + kstat_named_t arcstat_mfu_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. + */ + kstat_named_t arcstat_mfu_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + */ + kstat_named_t arcstat_mfu_ghost_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; @@ -299,16 +728,36 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; + kstat_named_t arcstat_l2_write_trylock_fail; + kstat_named_t arcstat_l2_write_passed_headroom; + kstat_named_t arcstat_l2_write_spa_mismatch; + kstat_named_t arcstat_l2_write_in_l2; + kstat_named_t arcstat_l2_write_hdr_io_in_progress; + kstat_named_t arcstat_l2_write_not_cacheable; + kstat_named_t arcstat_l2_write_full; + kstat_named_t arcstat_l2_write_buffer_iter; + kstat_named_t arcstat_l2_write_pios; + kstat_named_t arcstat_l2_write_buffer_bytes_scanned; + kstat_named_t arcstat_l2_write_buffer_list_iter; + kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_sync_wait_for_async; + kstat_named_t arcstat_demand_hit_predictive_prefetch; } arc_stats_t; static arc_stats_t arc_stats = { @@ -326,13 +775,15 @@ static arc_stats_t arc_stats = { { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, - { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -343,9 +794,28 @@ static arc_stats_t arc_stats = { { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, + { "metadata_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, + { "anon_size", KSTAT_DATA_UINT64 }, + { "anon_evictable_data", KSTAT_DATA_UINT64 }, + { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_evictable_data", 
KSTAT_DATA_UINT64 }, + { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_evictable_data", KSTAT_DATA_UINT64 }, + { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, @@ -355,22 +825,42 @@ static arc_stats_t arc_stats = { { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, + { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, + { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, + { "l2_write_in_l2", KSTAT_DATA_UINT64 }, + { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, + { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, + { "l2_write_full", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, + { "l2_write_pios", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 }, + { "arc_meta_min", KSTAT_DATA_UINT64 }, + { "sync_wait_for_async", KSTAT_DATA_UINT64 }, + { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)); + atomic_add_64(&arc_stats.stat.value.ui64, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) @@ -426,15 +916,21 @@ static arc_state_t *arc_l2c_only; #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ + +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t 
arc_loaned_bytes; -static uint64_t arc_meta_used; -static uint64_t arc_meta_limit; -static uint64_t arc_meta_max = 0; - -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; typedef struct arc_callback arc_callback_t; @@ -451,35 +947,64 @@ typedef struct arc_write_callback arc_wr struct arc_write_callback { void *awcb_private; arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_children_ready; + arc_done_func_t *awcb_physdone; arc_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; +#ifdef ZFS_DEBUG + /* + * used for debugging wtih kmem_flags - by allocating and freeing + * b_thawed when the buffer is thawed, we get a record of the stack + * trace that thawed it. + */ + void *b_thawed; +#endif - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - uint32_t b_flags; - uint32_t b_datacnt; - - arc_callback_t *b_acb; + uint32_t b_bufcnt; + /* for waiting on writes to complete */ kcondvar_t b_cv; - - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; + uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; - list_node_t b_arc_node; + multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; @@ -487,69 +1012,207 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; + arc_callback_t *b_acb; + void *b_pdata; +} l1arc_buf_hdr_t; + +typedef struct l2arc_dev l2arc_dev_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + + arc_buf_contents_t b_type; + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. 
This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; }; -static arc_buf_t *arc_eviction_list; -static kmutex_t arc_eviction_mtx; -static arc_buf_hdr_t arc_eviction_hdr; -static void arc_get_data_buf(arc_buf_t *buf); -static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); -static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); +#if defined(__FreeBSD__) && defined(_KERNEL) +static int +sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = arc_meta_limit; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val <= 0 || val > arc_c_max) + return (EINVAL); + + arc_meta_limit = val; + return (0); +} + +static int +sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_max; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (zfs_arc_max == 0) { + /* Loader tunable so blindly set */ + zfs_arc_max = val; + return (0); + } + + if (val < arc_abs_min || val > kmem_size()) + return (EINVAL); + if (val < arc_c_min) + return (EINVAL); + if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) + return (EINVAL); + + arc_c_max = val; + + arc_c = arc_c_max; + arc_p = (arc_c >> 1); + + if (zfs_arc_meta_limit == 0) { + /* limit meta-data to 1/4 of the arc capacity */ + arc_meta_limit = arc_c_max / 4; + } + + /* if kmem_flags are set, lets try to use less memory */ + if (kmem_debugging()) + arc_c = arc_c / 2; + + zfs_arc_max = arc_c; + + return (0); +} + +static int +sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_arc_min; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (zfs_arc_min == 0) { + /* Loader tunable so blindly set */ + zfs_arc_min = val; + return (0); + } + + if (val < arc_abs_min || val > arc_c_max) + return (EINVAL); + + arc_c_min = val; + + if (zfs_arc_meta_min == 0) + arc_meta_min = arc_c_min / 2; -static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); + if (arc_c < arc_c_min) + arc_c = arc_c_min; + + zfs_arc_min = arc_c_min; + + return (0); +} +#endif #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) -/* - * Private ARC flags. These flags are private ARC only flags that will show up - * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can - * be passed in as arc_flags in things like arc_read. 
However, these flags - * should never be passed and should only be set by ARC code. When adding new - * public flags, make sure not to smash the private ones. - */ - -#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ -#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ -#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ -#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ -#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ -#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ -#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ -#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ -#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ -#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) -#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) -#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) + +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2_READING(hdr) \ + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) + +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) /* * Other sizes */ -#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ -#define HT_LOCK_PAD 64 +#define HT_LOCK_PAD CACHE_LINE_SIZE struct ht_lock { kmutex_t ht_lock; @@ -562,7 +1225,7 @@ struct ht_lock { 
typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; - struct ht_lock ht_locks[BUF_LOCKS]; + struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); } buf_hash_table_t; static buf_hash_table_t buf_hash_table; @@ -571,8 +1234,8 @@ static buf_hash_table_t buf_hash_table; (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) -#define HDR_LOCK(buf) \ - (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; @@ -581,59 +1244,127 @@ uint64_t zfs_crc64_table[256]; */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_HEADROOM 2 /* num of writes */ +/* + * If we discover during ARC scan any buffers to be compressed, we boost + * our headroom for the next scanning cycle by this percentage multiple. + */ +#define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) -/* - * L2ARC Performance Tunables - */ +/* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, + &l2arc_write_max, 0, "max write size"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, + &l2arc_write_boost, 0, "extra write during warmup"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, + &l2arc_headroom, 0, "number of dev writes"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, + &l2arc_feed_secs, 0, "interval seconds"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, + &l2arc_feed_min_ms, 0, "min interval milliseconds"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, + &l2arc_noprefetch, 0, "don't cache prefetch bufs"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, + &l2arc_feed_again, 0, "turbo warmup"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, + &l2arc_norw, 0, "no reads during writes"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, + &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of anonymous state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, + &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + 
&ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, + &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru ghost state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, + &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu ghost state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, + &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); + /* * L2ARC Internals */ -typedef struct l2arc_dev { +struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_write; /* desired write size, bytes */ - uint64_t l2ad_boost; /* warmup write boost, bytes */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ - uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ -} l2arc_dev_t; + refcount_t l2ad_alloc; /* allocated bytes */ +}; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { - arc_buf_t *l2rcb_buf; /* read buffer */ - spa_t *l2rcb_spa; /* spa */ - blkptr_t l2rcb_bp; /* original blkptr */ - zbookmark_t l2rcb_zb; /* original bookmark */ - int l2rcb_flags; /* original flags */ + arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_phys_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ + void *l2rcb_data; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { @@ -641,17 
+1372,11 @@ typedef struct l2arc_write_callback { arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; -struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ -}; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; @@ -659,9 +1384,35 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void l2arc_read_done(zio_t *zio); -static void l2arc_hdr_stat_add(void); -static void l2arc_hdr_stat_remove(void); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); +static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); +static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static boolean_t arc_is_overflowing(); +static void arc_buf_watch(arc_buf_t *); + +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); + +static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static void l2arc_read_done(zio_t *); + +static void +l2arc_trim(const arc_buf_hdr_t *hdr) +{ + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT(HDR_HAS_L2HDR(hdr)); + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + + if (HDR_GET_PSIZE(hdr) != 0) { + trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, + HDR_GET_PSIZE(hdr), 0); + } +} static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) @@ -680,29 +1431,38 @@ buf_hash(uint64_t spa, const dva_t *dva, return (crc); } -#define BUF_EMPTY(buf) \ - ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_birth == 0) - -#define BUF_EQUAL(spa, dva, birth, buf) \ - ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ - ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ - ((buf)->b_birth == birth) && ((buf)->b_spa == spa) +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) + +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) + +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; +} static arc_buf_hdr_t * -buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { + const dva_t *dva = BP_IDENTITY(bp); + uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *buf; + arc_buf_hdr_t *hdr; mutex_enter(hash_lock); - for (buf = buf_hash_table.ht_table[idx]; buf != NULL; - buf = buf->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, buf)) { + for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; + hdr = hdr->b_hash_next) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; - return (buf); + return (hdr); } } mutex_exit(hash_lock); @@ -715,27 +1475,36 @@ buf_hash_find(uint64_t spa, 
const dva_t * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fbuf; + arc_buf_hdr_t *fhdr; uint32_t i; - ASSERT(!HDR_IN_HASH_TABLE(buf)); - *lockp = hash_lock; - mutex_enter(hash_lock); - for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; - fbuf = fbuf->b_hash_next, i++) { - if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) - return (fbuf); + ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); + ASSERT(hdr->b_birth != 0); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; + fhdr = fhdr->b_hash_next, i++) { + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + return (fhdr); } - buf->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = buf; - buf->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = hdr; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { @@ -753,22 +1522,22 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmut } static void -buf_hash_remove(arc_buf_hdr_t *buf) +buf_hash_remove(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *fbuf, **bufp; - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + arc_buf_hdr_t *fhdr, **hdrp; + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(buf)); + ASSERT(HDR_IN_HASH_TABLE(hdr)); - bufp = &buf_hash_table.ht_table[idx]; - while ((fbuf = *bufp) != buf) { - ASSERT(fbuf != NULL); - bufp = &fbuf->b_hash_next; - } - *bufp = buf->b_hash_next; - buf->b_hash_next = NULL; - buf->b_flags &= ~ARC_IN_HASH_TABLE; + hdrp = &buf_hash_table.ht_table[idx]; + while ((fhdr = *hdrp) != hdr) { + ASSERT3P(fhdr, !=, NULL); + hdrp = &fhdr->b_hash_next; + } + *hdrp = hdr->b_hash_next; + hdr->b_hash_next = NULL; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -781,7 +1550,8 @@ buf_hash_remove(arc_buf_hdr_t *buf) /* * Global data structures and functions for the buf kmem cache. 
*/ -static kmem_cache_t *hdr_cache; +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void @@ -793,7 +1563,8 @@ buf_fini(void) (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -803,15 +1574,34 @@ buf_fini(void) */ /* ARGSUSED */ static int -hdr_cons(void *vbuf, void *unused, int kmflag) +hdr_full_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + +#ifdef __NetBSD__ + hdr = unused; +#endif + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + multilist_link_init(&hdr->b_l1hdr.b_arc_node); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { - arc_buf_hdr_t *buf = unused; + arc_buf_hdr_t *hdr = vbuf; - bzero(buf, sizeof (arc_buf_hdr_t)); - refcount_create(&buf->b_refcnt); - cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); +#ifdef __NetBSD__ + hdr = unused; +#endif + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } @@ -820,10 +1610,13 @@ hdr_cons(void *vbuf, void *unused, int k static int buf_cons(void *vbuf, void *unused, int kmflag) { - arc_buf_t *buf = unused; + arc_buf_t *buf = vbuf; +#ifdef __NetBSD__ + buf = unused; +#endif bzero(buf, sizeof (arc_buf_t)); - rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); @@ -835,24 +1628,44 @@ buf_cons(void *vbuf, void *unused, int k */ /* ARGSUSED */ static void -hdr_dest(void *vbuf, void *unused) +hdr_full_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + +#ifdef __NetBSD__ + hdr = unused; +#endif + ASSERT(HDR_EMPTY(hdr)); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) { - arc_buf_hdr_t *buf = unused; + arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(buf)); - refcount_destroy(&buf->b_refcnt); - cv_destroy(&buf->b_cv); - mutex_destroy(&buf->b_freeze_lock); - arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); +#ifdef __NetBSD__ + hdr = unused; +#endif + ASSERT(HDR_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { - arc_buf_t *buf = unused; + arc_buf_t *buf = vbuf; - rw_destroy(&buf->b_lock); +#ifdef __NetBSD__ + buf = unused; +#endif + mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } @@ -869,7 +1682,7 @@ hdr_recl(void *unused) * which is after we do arc_fini(). 
*/ if (!arc_dead) - cv_signal(&arc_reclaim_thr_cv); + cv_signal(&arc_reclaim_thread_cv); } static void @@ -881,10 +1694,11 @@ buf_init(void) /* * The hash table is big enough to fill all of physical memory - * with an average 64K block size. The table will take up - * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). + * with an average block size of zfs_arc_average_blocksize (default 8K). + * By default, the table will take up + * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ - while (hsize * 65536 < (uint64_t)physmem * PAGESIZE) + while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; @@ -896,8 +1710,11 @@ retry: goto retry; } - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, + NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -913,232 +1730,709 @@ retry: #define ARC_MINTIME (hz>>4) /* 62 ms */ +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) +{ + boolean_t shared = (buf->b_data != NULL && + buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + return (shared); +} + +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +} + static void arc_cksum_verify(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); - if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } -static int -arc_cksum_equal(arc_buf_t *buf) +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { - zio_cksum_t zc; - int equal; + enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); + boolean_t valid_cksum; - mutex_enter(&buf->b_hdr->b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); - equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_freeze_lock); + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); + + /* + * We rely on the blkptr's checksum to determine if the block + * is valid or not. When compressed arc is enabled, the l2arc + * writes the block to the l2arc just as it appears in the pool. 
+ * This allows us to use the blkptr's checksum to validate the + * data that we just read off of the l2arc without having to store + * a separate checksum in the arc_buf_hdr_t. However, if compressed + * arc is disabled, then the data written to the l2arc is always + * uncompressed and won't match the block as it exists in the main + * pool. When this is the case, we must first compress it if it is + * compressed on the main pool before we can validate the checksum. + */ + if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t csize; + + void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); + csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); + if (csize < HDR_GET_PSIZE(hdr)) { + /* + * Compressed blocks are always a multiple of the + * smallest ashift in the pool. Ideally, we would + * like to round up the csize to the next + * spa_min_ashift but that value may have changed + * since the block was last written. Instead, + * we rely on the fact that the hdr's psize + * was set to the psize of the block when it was + * last written. We set the csize to that value + * and zero out any part that should not contain + * data. + */ + bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); + csize = HDR_GET_PSIZE(hdr); + } + zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); + } - return (equal); + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. 
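[Aside, not part of the patch] The validation path described in the comment above boils down to: rebuild the exact physical (on-disk) byte image — recompressing the in-memory copy if compressed ARC is disabled but the block is compressed in the pool — and only then compare its checksum against the one recorded for the block. The following is a minimal, self-contained sketch of that shape; toy_compress() and toy_cksum() are deliberately trivial stand-ins, not the real zio_compress_data()/fletcher code, and all toy_* names are made up.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for zio_compress_data(): any deterministic, shrinking transform. */
static size_t
toy_compress(const uint8_t *src, size_t lsize, uint8_t *dst)
{
	size_t csize = 0;

	for (size_t i = 0; i < lsize; i += 2)
		dst[csize++] = src[i] ^ (i + 1 < lsize ? src[i + 1] : 0);
	return (csize);
}

/* Stand-in for the checksum taken over the physical (on-disk) bytes. */
static uint64_t
toy_cksum(const uint8_t *p, size_t n)
{
	uint64_t a = 0, b = 0;

	for (size_t i = 0; i < n; i++) {
		a += p[i];
		b += a;
	}
	return ((b << 32) ^ a);
}

/*
 * Validate an in-memory copy against the checksum recorded for the on-disk
 * image.  If the copy is uncompressed but the block is compressed on disk,
 * recompress it first and zero any tail padding so the bytes we checksum
 * match the on-disk layout.
 */
static int
toy_validate(const uint8_t *mem, size_t lsize, int mem_compressed,
    int disk_compressed, size_t psize, uint64_t disk_cksum)
{
	uint8_t *phys = malloc(psize);
	int ok;

	if (!mem_compressed && disk_compressed) {
		uint8_t *tmp = malloc(lsize);
		size_t csize = toy_compress(mem, lsize, tmp);

		assert(csize <= psize);
		memcpy(phys, tmp, csize);
		memset(phys + csize, 0, psize - csize);
		free(tmp);
	} else {
		memcpy(phys, mem, psize);
	}
	ok = (toy_cksum(phys, psize) == disk_cksum);
	free(phys);
	return (ok);
}

int
main(void)
{
	uint8_t data[64], disk[64];
	size_t psize;

	for (size_t i = 0; i < sizeof (data); i++)
		data[i] = (uint8_t)i;
	psize = toy_compress(data, sizeof (data), disk);	/* on-disk image */

	uint64_t bp_cksum = toy_cksum(disk, psize);
	assert(toy_validate(data, sizeof (data), 0, 1, psize, bp_cksum));
	return (0);
}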
+ */ + valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, + zio->io_offset, NULL) == 0); + zio_pop_transforms(zio); + return (valid_cksum); } static void -arc_cksum_compute(arc_buf_t *buf, boolean_t force) +arc_cksum_compute(arc_buf_t *buf) { - if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } - buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, - buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, + hdr->b_l1hdr.b_freeze_cksum); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +#ifdef illumos + arc_buf_watch(buf); +#endif } -void -arc_buf_thaw(arc_buf_t *buf) +#ifdef illumos +#ifndef _KERNEL +typedef struct procctl { + long cmd; + prwatch_t prwatch; +} procctl_t; +#endif + +/* ARGSUSED */ +static void +arc_buf_unwatch(arc_buf_t *buf) { - if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_state != arc_anon) +#ifndef _KERNEL + if (arc_watch) { + int result; + procctl_t ctl; + ctl.cmd = PCWATCH; + ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; + ctl.prwatch.pr_size = 0; + ctl.prwatch.pr_wflags = 0; + result = write(arc_procfd, &ctl, sizeof (ctl)); + ASSERT3U(result, ==, sizeof (ctl)); + } +#endif +} + +/* ARGSUSED */ +static void +arc_buf_watch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) { + int result; + procctl_t ctl; + ctl.cmd = PCWATCH; + ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; + ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr); + ctl.prwatch.pr_wflags = WA_WRITE; + result = write(arc_procfd, &ctl, sizeof (ctl)); + ASSERT3U(result, ==, sizeof (ctl)); + } +#endif +} +#endif /* illumos */ + +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + arc_buf_contents_t type; + if (HDR_ISTYPE_METADATA(hdr)) { + type = ARC_BUFC_METADATA; + } else { + type = ARC_BUFC_DATA; + } + VERIFY3U(hdr->b_type, ==, type); + return (type); +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + if (HDR_IO_IN_PROGRESS(hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum != NULL) { - kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - buf->b_hdr->b_freeze_cksum = NULL; + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (hdr->b_l1hdr.b_thawed != NULL) + 
kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } - mutex_exit(&buf->b_hdr->b_freeze_lock); +#endif + + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + +#ifdef illumos + arc_buf_unwatch(buf); +#endif } void arc_buf_freeze(arc_buf_t *buf) { + arc_buf_hdr_t *hdr = buf->b_hdr; + kmutex_t *hash_lock; + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); - arc_cksum_compute(buf, B_FALSE); + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || + hdr->b_l1hdr.b_state == arc_anon); + arc_cksum_compute(buf); + mutex_exit(hash_lock); + +} + +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. + */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + hdr->b_flags &= ~flags; } +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in + * thread-safe manner. + */ static void -add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { - ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and set the + * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. + * Holes and embedded blocks remain anonymous so we don't + * want to uncompress them. Mark them as uncompressed. + */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } +} + +static int +arc_decompress(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + int error; + + if (arc_buf_is_shared(buf)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { + /* + * The arc_buf_hdr_t is either not compressed or is + * associated with an embedded block or a hole in which + * case they remain anonymous. 
+ */ + IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || + HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); + } else { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (error != 0) { + zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", + hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + return (SET_ERROR(EIO)); + } + } + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + arc_cksum_compute(buf); + return (0); +} + +/* + * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. + */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + (void) refcount_remove_many(&state->arcs_esize[type], + lsize, buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. 
When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + if (!MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } + + arc_state_t *state = hdr->b_l1hdr.b_state; - if ((refcount_add(&ab->b_refcnt, tag) == 1) && - (ab->b_state != arc_anon)) { - uint64_t delta = ab->b_size * ab->b_datacnt; - list_t *list = &ab->b_state->arcs_list[ab->b_type]; - uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; - - ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); - mutex_enter(&ab->b_state->arcs_mtx); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(list, ab); - if (GHOST_STATE(ab->b_state)) { - ASSERT3U(ab->b_datacnt, ==, 0); - ASSERT3P(ab->b_buf, ==, NULL); - delta = ab->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); - mutex_exit(&ab->b_state->arcs_mtx); + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. */ + if (state != arc_l2c_only) { + multilist_remove(&state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evitable_space_decrement(hdr, state); + } /* remove the prefetch flag if we get a reference */ - if (ab->b_flags & ARC_PREFETCH) - ab->b_flags &= ~ARC_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. + */ static int -remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = ab->b_state; + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. + */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[ab->b_type]; - - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list[ab->b_type], ab); - ASSERT(ab->b_datacnt > 0); - atomic_add_64(size, ab->b_size * ab->b_datacnt); - mutex_exit(&state->arcs_mtx); + multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); } return (cnt); } /* - * Move the supplied buffer to the indicated state. The mutex + * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. 
*/ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) { - arc_state_t *old_state = ab->b_state; - int64_t refcnt = refcount_count(&ab->b_refcnt); - uint64_t from_delta, to_delta; + arc_state_t *old_state; + int64_t refcnt; + uint32_t bufcnt; + boolean_t update_old, update_new; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); + } else { + old_state = arc_l2c_only; + refcnt = 0; + bufcnt = 0; + update_old = B_FALSE; + } + update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); - ASSERT(new_state != old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); - ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon); - ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); - - from_delta = to_delta = ab->b_datacnt * ab->b_size; + ASSERT3P(new_state, !=, old_state); + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { - if (old_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[ab->b_type]; - - if (use_mutex) - mutex_enter(&old_state->arcs_mtx); - - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list[ab->b_type], ab); + if (old_state != arc_anon && old_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_remove(&old_state->arcs_list[buftype], hdr); + + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; + } + arc_evitable_space_decrement(hdr, old_state); + } + if (new_state != arc_anon && new_state != arc_l2c_only) { /* - * If prefetching out of the ghost cache, - * we will have a non-null datacnt. + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. 
*/ - if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { - /* ghost elements have a ghost size */ - ASSERT(ab->b_buf == NULL); - from_delta = ab->b_size; - } - ASSERT3U(*size, >=, from_delta); - atomic_add_64(size, -from_delta); + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_insert(&new_state->arcs_list[buftype], hdr); - if (use_mutex) - mutex_exit(&old_state->arcs_mtx); + if (GHOST_STATE(new_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; + } + arc_evictable_space_increment(hdr, new_state); } - if (new_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + } - if (use_mutex) - mutex_enter(&new_state->arcs_mtx); + ASSERT(!HDR_EMPTY(hdr)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); - list_insert_head(&new_state->arcs_list[ab->b_type], ab); + /* adjust state sizes (ignore arc_l2c_only) */ - /* ghost elements have a ghost size */ - if (GHOST_STATE(new_state)) { - ASSERT(ab->b_datacnt == 0); - ASSERT(ab->b_buf == NULL); - to_delta = ab->b_size; + if (update_new && new_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(new_state)) { + ASSERT0(bufcnt); + + /* + * When moving a header to a ghost state, we first + * remove all arc buffers. Thus, we'll have a + * bufcnt of zero, and no arc buffer to use for + * the reference. As a result, we use the arc + * header pointer for the reference. + */ + (void) refcount_add_many(&new_state->arcs_size, + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + } else { + uint32_t buffers = 0; + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } + + (void) refcount_add_many(&new_state->arcs_size, + HDR_GET_LSIZE(hdr), buf); } - atomic_add_64(size, to_delta); + ASSERT3U(bufcnt, ==, buffers); - if (use_mutex) - mutex_exit(&new_state->arcs_mtx); + if (hdr->b_l1hdr.b_pdata != NULL) { + (void) refcount_add_many(&new_state->arcs_size, + arc_hdr_size(hdr), hdr); + } else { + ASSERT(GHOST_STATE(old_state)); + } } } - ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon) { - buf_hash_remove(ab); - } + if (update_old && old_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + + /* + * When moving a header off of a ghost state, + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. + */ + + (void) refcount_remove_many(&old_state->arcs_size, + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + } else { + uint32_t buffers = 0; + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + ASSERT3P(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. 
Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + continue; + } - /* adjust state sizes */ - if (to_delta) - atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { - ASSERT3U(old_state->arcs_size, >=, from_delta); - atomic_add_64(&old_state->arcs_size, -from_delta); + (void) refcount_remove_many( + &old_state->arcs_size, HDR_GET_LSIZE(hdr), + buf); + } + ASSERT3U(bufcnt, ==, buffers); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + (void) refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), hdr); + } } - ab->b_state = new_state; - /* adjust l2arc hdr stats */ - if (new_state == arc_l2c_only) - l2arc_hdr_stat_add(); - else if (old_state == arc_l2c_only) - l2arc_hdr_stat_remove(); + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; + + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. + */ + ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1150,6 +2444,9 @@ arc_space_consume(uint64_t space, arc_sp case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; + case ARC_SPACE_META: + ARCSTAT_INCR(arcstat_metadata_size, space); + break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, space); break; @@ -1161,7 +2458,9 @@ arc_space_consume(uint64_t space, arc_sp break; } - atomic_add_64(&arc_meta_used, space); + if (type != ARC_SPACE_DATA) + ARCSTAT_INCR(arcstat_meta_used, space); + atomic_add_64(&arc_size, space); } @@ -1174,6 +2473,9 @@ arc_space_return(uint64_t space, arc_spa case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; + case ARC_SPACE_META: + ARCSTAT_INCR(arcstat_metadata_size, -space); + break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, -space); break; @@ -1185,58 +2487,96 @@ arc_space_return(uint64_t space, arc_spa break; } - ASSERT(arc_meta_used >= space); - if (arc_meta_max < arc_meta_used) - arc_meta_max = arc_meta_used; - atomic_add_64(&arc_meta_used, -space); + if (type != ARC_SPACE_DATA) { + ASSERT(arc_meta_used >= space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + ARCSTAT_INCR(arcstat_meta_used, -space); + } + ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } -void * -arc_data_buf_alloc(uint64_t size) +/* + * Allocate an initial buffer for this hdr, subsequent buffers will + * use arc_buf_clone(). + */ +static arc_buf_t * +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { - if (arc_evict_needed(ARC_BUFC_DATA)) - cv_signal(&arc_reclaim_thr_cv); - atomic_add_64(&arc_size, size); - return (zio_data_buf_alloc(size)); -} + arc_buf_t *buf; -void -arc_data_buf_free(void *buf, uint64_t size) -{ - zio_data_buf_free(buf, size); - ASSERT(arc_size >= size); - atomic_add_64(&arc_size, -size); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = NULL; + + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. 
+ */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * If the hdr's data can be shared (no byteswapping, hdr is + * uncompressed, hdr's data is not currently being written to the + * L2ARC write) then we share the data buffer and set the appropriate + * bit in the hdr's b_flags to indicate the hdr is sharing it's + * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to + * store the buf's data. + */ + if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { + buf->b_data = hdr->b_l1hdr.b_pdata; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } + VERIFY3P(buf->b_data, !=, NULL); + + hdr->b_l1hdr.b_buf = buf; + hdr->b_l1hdr.b_bufcnt += 1; + + return (buf); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +/* + * Used when allocating additional buffers. + */ +static arc_buf_t * +arc_buf_clone(arc_buf_t *from) { - arc_buf_hdr_t *hdr; arc_buf_t *buf; + arc_buf_hdr_t *hdr = from->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); - ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); - ASSERT(BUF_EMPTY(hdr)); - hdr->b_size = size; - hdr->b_type = type; - hdr->b_spa = spa_guid(spa); - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_buf = buf; - arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; + buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + bcopy(from->b_data, buf->b_data, size); + hdr->b_l1hdr.b_bufcnt += 1; + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } @@ -1253,7 +2593,7 @@ arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; - buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); @@ -1267,161 +2607,224 @@ arc_return_buf(arc_buf_t *buf, void *tag { arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(buf->b_data != NULL); - (void) refcount_add(&hdr->b_refcnt, tag); - (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + ASSERT3P(buf->b_data, !=, NULL); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); - atomic_add_64(&arc_loaned_bytes, -hdr->b_size); + atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = buf->b_hdr; - rw_enter(&buf->b_lock, RW_WRITER); - ASSERT(buf->b_data != NULL); - hdr = buf->b_hdr; - (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); - (void) refcount_remove(&hdr->b_refcnt, tag); - buf->b_efunc = NULL; - buf->b_private = NULL; + ASSERT3P(buf->b_data, !=, NULL); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); - 
atomic_add_64(&arc_loaned_bytes, hdr->b_size); - rw_exit(&buf->b_lock); + atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } -static arc_buf_t * -arc_buf_clone(arc_buf_t *from) +static void +l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { - arc_buf_t *buf; - arc_buf_hdr_t *hdr = from->b_hdr; - uint64_t size = hdr->b_size; - - ASSERT(hdr->b_state != arc_anon); + l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; - arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); - hdr->b_datacnt += 1; - return (buf); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_type = type; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); } -void -arc_buf_add_ref(arc_buf_t *buf, void* tag) +static void +arc_hdr_free_on_write(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - - /* - * Check to see if this buffer is evicted. Callers - * must verify b_data != NULL to know if the add_ref - * was successful. - */ - rw_enter(&buf->b_lock, RW_READER); - if (buf->b_data == NULL) { - rw_exit(&buf->b_lock); - return; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t size = arc_hdr_size(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, hdr); + } + (void) refcount_remove_many(&state->arcs_size, size, hdr); + if (type == ARC_BUFC_METADATA) { + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_space_return(size, ARC_SPACE_DATA); } - hdr = buf->b_hdr; - ASSERT(hdr != NULL); - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - rw_exit(&buf->b_lock); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); - add_reference(hdr, hash_lock, tag); - DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); - arc_access(hdr, hash_lock); - mutex_exit(hash_lock); - ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, - data, metadata, hits); + l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); } /* - * Free the arc data buffer. If it is an l2arc write in progress, - * the buffer is placed on l2arc_free_on_write to be freed later. + * Share the arc_buf_t's data with the hdr. Whenever we are sharing the + * data buffer, we transfer the refcount ownership to the hdr and update + * the appropriate kstats. 
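[Aside, not part of the patch] The "refcount ownership transfer" mentioned in the comment above is easiest to picture with a toy tagged refcount: the total count never changes, only the holder recorded for the reference switches between the arc_buf_t and the header. This is a sketch of that idea only; it is not the real refcount_t implementation, and every toy_* name is invented.

#include <assert.h>
#include <stddef.h>

#define	TOY_MAX_HOLDERS	8

/* Toy tagged refcount: every live reference records who holds it. */
typedef struct toy_refcount {
	int		rc_count;
	const void	*rc_holder[TOY_MAX_HOLDERS];
} toy_refcount_t;

static void
toy_refcount_add(toy_refcount_t *rc, const void *holder)
{
	assert(rc->rc_count < TOY_MAX_HOLDERS);
	rc->rc_holder[rc->rc_count++] = holder;
}

static void
toy_refcount_remove(toy_refcount_t *rc, const void *holder)
{
	for (int i = 0; i < rc->rc_count; i++) {
		if (rc->rc_holder[i] == holder) {
			rc->rc_holder[i] = rc->rc_holder[--rc->rc_count];
			return;
		}
	}
	assert(!"removing a reference that was never added");
}

/*
 * Ownership transfer: the reference itself survives, only the recorded
 * holder changes.  This mirrors the shape of what happens when a buffer
 * starts or stops sharing its data block with the header.
 */
static void
toy_refcount_transfer(toy_refcount_t *rc, const void *from, const void *to)
{
	toy_refcount_remove(rc, from);
	toy_refcount_add(rc, to);
}

int
main(void)
{
	toy_refcount_t rc = { 0 };
	int buf, hdr;

	toy_refcount_add(&rc, &buf);		/* buf owns its private copy */
	toy_refcount_transfer(&rc, &buf, &hdr);	/* start sharing with the hdr */
	assert(rc.rc_count == 1);		/* the total never changed */
	toy_refcount_transfer(&rc, &hdr, &buf);	/* stop sharing again */
	return (0);
}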
*/ static void -arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - void *data, size_t size) +arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - if (HDR_L2_WRITING(hdr)) { - l2arc_data_free_t *df; - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); - df->l2df_data = data; - df->l2df_size = size; - df->l2df_func = free_func; - mutex_enter(&l2arc_free_on_write_mtx); - list_insert_head(l2arc_free_on_write, df); - mutex_exit(&l2arc_free_on_write_mtx); - ARCSTAT_BUMP(arcstat_l2_free_on_write); - } else { - free_func(data, size); - } + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + + /* + * Start sharing the data buffer. We transfer the + * refcount ownership to the hdr since it always owns + * the refcount whenever an arc_buf_t is shared. + */ + refcount_transfer_ownership(&state->arcs_size, buf, hdr); + hdr->b_l1hdr.b_pdata = buf->b_data; + arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* + * Since we've transferred ownership to the hdr we need + * to increment its compressed and uncompressed kstats and + * decrement the overhead size. + */ + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); } static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - arc_buf_t **bufp; + arc_state_t *state = hdr->b_l1hdr.b_state; - /* free up data associated with the buf */ - if (buf->b_data) { - arc_state_t *state = buf->b_hdr->b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - arc_cksum_verify(buf); + /* + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. + */ + refcount_transfer_ownership(&state->arcs_size, hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + hdr->b_l1hdr.b_pdata = NULL; - if (!recycle) { - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf->b_hdr, zio_buf_free, - buf->b_data, size); - arc_space_return(size, ARC_SPACE_DATA); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf->b_hdr, - zio_data_buf_free, buf->b_data, size); - ARCSTAT_INCR(arcstat_data_size, -size); - atomic_add_64(&arc_size, -size); - } - } - if (list_link_active(&buf->b_hdr->b_arc_node)) { - uint64_t *cnt = &state->arcs_lsize[type]; + /* + * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. + */ + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); +} + +/* + * Free up buf->b_data and if 'remove' is set, then pull the + * arc_buf_t off of the the arc_buf_hdr_t's list and free it. 
+ */ +static void +arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) +{ + arc_buf_t **bufp; + arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t size = HDR_GET_LSIZE(hdr); + boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); - ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); - ASSERT(state != arc_anon); + /* + * Free up the data associated with the buf but only + * if we're not sharing this with the hdr. If we are sharing + * it with the hdr, then hdr will have performed the allocation + * so allow it to do the free. + */ + if (buf->b_data != NULL) { + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - ASSERT3U(*cnt, >=, size); - atomic_add_64(cnt, -size); + arc_cksum_verify(buf); +#ifdef illumos + arc_buf_unwatch(buf); +#endif + + if (destroyed_buf_is_shared) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(HDR_SHARED_DATA(hdr)); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); } - ASSERT3U(state->arcs_size, >=, size); - atomic_add_64(&state->arcs_size, -size); buf->b_data = NULL; - ASSERT(buf->b_hdr->b_datacnt > 0); - buf->b_hdr->b_datacnt -= 1; + + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; } /* only remove the buf if requested */ - if (!all) + if (!remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) - continue; - *bufp = buf->b_next; + arc_buf_t *lastbuf = NULL; + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + + /* + * If the current arc_buf_t is sharing its data + * buffer with the hdr, then reassign the hdr's + * b_pdata to share it with the new buffer at the end + * of the list. The shared buffer is always the last one + * on the hdr's buffer list. + */ + if (destroyed_buf_is_shared && lastbuf != NULL) { + ASSERT(ARC_BUF_LAST(buf)); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + arc_hdr_free_pdata(hdr); - ASSERT(buf->b_efunc == NULL); + /* + * We must setup a new shared block between the + * last buffer and the hdr. The data would have + * been allocated by the arc buf so we need to transfer + * ownership to the hdr since it's now being shared. 
+ */ + arc_share_buf(hdr, lastbuf); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + + if (hdr->b_l1hdr.b_bufcnt == 0) + arc_cksum_free(hdr); /* clean up the buf */ buf->b_hdr = NULL; @@ -1429,527 +2832,1026 @@ arc_buf_destroy(arc_buf_t *buf, boolean_ } static void -arc_hdr_destroy(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) { - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - ASSERT3P(hdr->b_state, ==, arc_anon); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr)); + + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); +} + +static void +arc_hdr_free_pdata(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + + /* + * If the hdr is currently being written to the l2arc then + * we defer freeing the data by adding it to the l2arc_free_on_write + * list. The l2arc will free the data once it's finished + * writing it to the l2arc device. + */ + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr), hdr); + } + hdr->b_l1hdr.b_pdata = NULL; + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + enum zio_compress compress, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr; + + ASSERT3U(lsize, >, 0); + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compress); + + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; + + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. + */ + arc_hdr_alloc_pdata(hdr); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + + return (hdr); +} + +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. 
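[Aside, not part of the patch] The split into a full and an L2-only header cache works because the fields needed for an L2-only block form a leading prefix of the full header, so a header can be "shrunk" by copying just that prefix into a smaller allocation. The sketch below shows only that layout idea with invented toy_* struct and field names; it is not the real arc_buf_hdr_t layout or the kmem cache API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Fields still needed once a block survives only in the L2ARC. */
typedef struct toy_l2only_hdr {
	unsigned long	th_dva[2];	/* block identity */
	unsigned long	th_birth;
	unsigned int	th_flags;
	unsigned long	th_l2daddr;	/* where it sits on the cache device */
} toy_l2only_hdr_t;

/* Full header: the L2-only part must stay the leading prefix. */
typedef struct toy_full_hdr {
	toy_l2only_hdr_t th_l2only;
	void		*th_data;	/* in-core copy */
	int		 th_refcnt;
	char		 th_l1_extras[96];	/* locks, condvars, lists, ... */
} toy_full_hdr_t;

/*
 * "Demote" a header to the small form: copy only the shared prefix into a
 * smaller allocation and free the big one, which is the shape of what the
 * realloc between the two caches does.
 */
static toy_l2only_hdr_t *
toy_demote(toy_full_hdr_t *full)
{
	toy_l2only_hdr_t *small = malloc(sizeof (*small));

	memcpy(small, &full->th_l2only, sizeof (*small));
	free(full);
	return (small);
}

int
main(void)
{
	printf("full: %zu bytes, l2-only: %zu bytes\n",
	    sizeof (toy_full_hdr_t), sizeof (toy_l2only_hdr_t));

	toy_full_hdr_t *full = calloc(1, sizeof (*full));
	toy_l2only_hdr_t *small = toy_demote(full);

	free(small);
	return (0);
}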
+ */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + ASSERT(HDR_HAS_L2HDR(hdr)); + + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); - if (l2hdr != NULL) { - boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + + if (new == hdr_full_cache) { + arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + + /* Verify previous threads set to NULL before freeing */ + ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); + } else { + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + /* + * If we've reached here, We must have been called from + * arc_evict_hdr(), as such we should have already been + * removed from any ghost list we were previously on + * (which protects us from racing with arc_evict_state), + * thus no locking is needed during this check. + */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + + /* + * A buffer must not be moved into the arc_l2c_only + * state if it's not finished being written out to the + * l2arc device. Otherwise, the b_l1hdr.b_pdata field + * might try to be accessed, even though it was removed. */ - if (!buflist_held) { - mutex_enter(&l2arc_buflist_mtx); - l2hdr = hdr->b_l2hdr; + VERIFY(!HDR_L2_WRITING(hdr)); + VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); + +#ifdef ZFS_DEBUG + if (hdr->b_l1hdr.b_thawed != NULL) { + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = NULL; } +#endif + + arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. + */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. + */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + /* + * Since we're using the pointer address as the tag when + * incrementing and decrementing the l2ad_alloc refcount, we + * must remove the old pointer (that we're about to destroy) and + * add the new pointer to the refcount. Otherwise we'd remove + * the wrong pointer address when calling arc_hdr_destroy() later. 
+ */ + + (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); + (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); + + buf_discard_identity(hdr); + kmem_cache_free(old, hdr); + + return (nhdr); +} + +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. + */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) +{ + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + ZIO_COMPRESS_OFF, type); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag); + arc_buf_thaw(buf); + return (buf); +} + +static void +arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t asize = arc_hdr_size(hdr); + + ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); + ASSERT(HDR_HAS_L2HDR(hdr)); + + list_remove(&dev->l2ad_buflist, hdr); + + ARCSTAT_INCR(arcstat_l2_asize, -asize); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); + + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + + (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); +} + +static void +arc_hdr_destroy(arc_buf_hdr_t *hdr) +{ + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_buf == NULL || + hdr->b_l1hdr.b_bufcnt > 0); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + } + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (!HDR_EMPTY(hdr)) + buf_discard_identity(hdr); + + if (HDR_HAS_L2HDR(hdr)) { + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); - if (l2hdr != NULL) { - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; + if (!buflist_held) + mutex_enter(&dev->l2ad_mtx); + + /* + * Even though we checked this conditional above, we + * need to check this again now that we have the + * l2ad_mtx. This is because we could be racing with + * another thread calling l2arc_evict() which might have + * destroyed this header's L2 portion as we were waiting + * to acquire the l2ad_mtx. If that happens, we don't + * want to re-destroy the header's L2 portion. 
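[Aside, not part of the patch] The check made before and again after taking l2ad_mtx above is the usual "test, take the lock, test again" pattern: the first test is only a hint, and the state may change while this thread is blocked on the mutex. A generic pthread sketch of the pattern follows, with invented toy_* names rather than the ZFS locking primitives.

#include <pthread.h>
#include <stdbool.h>

typedef struct toy_hdr {
	pthread_mutex_t	th_l2_lock;	/* plays the role of the device mutex */
	bool		th_has_l2;	/* protected by th_l2_lock */
} toy_hdr_t;

/* Tear down the L2 bookkeeping; the caller must hold th_l2_lock. */
static void
toy_l2_teardown(toy_hdr_t *h)
{
	h->th_has_l2 = false;
}

static void
toy_destroy_l2_part(toy_hdr_t *h)
{
	/* Unlocked peek: cheap, but only a hint. */
	if (!h->th_has_l2)
		return;

	pthread_mutex_lock(&h->th_l2_lock);

	/*
	 * Re-check under the lock: another thread (the analogue of
	 * l2arc_evict()) may have torn the L2 part down while we were
	 * blocked on the mutex, and we must not do it twice.
	 */
	if (h->th_has_l2)
		toy_l2_teardown(h);

	pthread_mutex_unlock(&h->th_l2_lock);
}

int
main(void)
{
	toy_hdr_t h = { PTHREAD_MUTEX_INITIALIZER, true };

	toy_destroy_l2_part(&h);
	toy_destroy_l2_part(&h);	/* second call is a harmless no-op */
	return (0);
}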
+ */ + if (HDR_HAS_L2HDR(hdr)) { + l2arc_trim(hdr); + arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); } - if (!BUF_EMPTY(hdr)) { - ASSERT(!HDR_IN_HASH_TABLE(hdr)); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; - } - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; - - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - rw_enter(&buf->b_lock, RW_WRITER); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_buf, FALSE, FALSE); - hdr->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - rw_exit(&buf->b_lock); - mutex_exit(&arc_eviction_mtx); - } else { - arc_buf_destroy(hdr->b_buf, FALSE, TRUE); + if (HDR_HAS_L1HDR(hdr)) { + arc_cksum_free(hdr); + + while (hdr->b_l1hdr.b_buf != NULL) + arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); + +#ifdef ZFS_DEBUG + if (hdr->b_l1hdr.b_thawed != NULL) { + kmem_free(hdr->b_l1hdr.b_thawed, 1); + hdr->b_l1hdr.b_thawed = NULL; + } +#endif + + if (hdr->b_l1hdr.b_pdata != NULL) { + arc_hdr_free_pdata(hdr); } - } - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; } - ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT3P(hdr->b_hash_next, ==, NULL); - ASSERT3P(hdr->b_acb, ==, NULL); - kmem_cache_free(hdr_cache, hdr); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + kmem_cache_free(hdr_full_cache, hdr); + } else { + kmem_cache_free(hdr_l2only_cache, hdr); + } } void -arc_buf_free(arc_buf_t *buf, void *tag) +arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_state != arc_anon; + kmutex_t *hash_lock = HDR_LOCK(hdr); + + if (hdr->b_l1hdr.b_state == arc_anon) { + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + VERIFY0(remove_reference(hdr, NULL, tag)); + arc_hdr_destroy(hdr); + return; + } - ASSERT(buf->b_efunc == NULL); - ASSERT(buf->b_data != NULL); + mutex_enter(hash_lock); + ASSERT3P(hdr, ==, buf->b_hdr); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); + ASSERT3P(buf->b_data, !=, NULL); - if (hashed) { - kmutex_t *hash_lock = HDR_LOCK(hdr); + (void) remove_reference(hdr, hash_lock, tag); + arc_buf_destroy_impl(buf, B_TRUE); + mutex_exit(hash_lock); +} + +int32_t +arc_buf_size(arc_buf_t *buf) +{ + return (HDR_GET_LSIZE(buf->b_hdr)); +} + +/* + * Evict the arc_buf_hdr that is provided as a parameter. The resultant + * state of the header is dependent on its state prior to entering this + * function. 
The following transitions are possible: + * + * - arc_mru -> arc_mru_ghost + * - arc_mfu -> arc_mfu_ghost + * - arc_mru_ghost -> arc_l2c_only + * - arc_mru_ghost -> deleted + * - arc_mfu_ghost -> arc_l2c_only + * - arc_mfu_ghost -> deleted + */ +static int64_t +arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) +{ + arc_state_t *evicted_state, *state; + int64_t bytes_evicted = 0; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); + + state = hdr->b_l1hdr.b_state; + if (GHOST_STATE(state)) { + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - mutex_enter(hash_lock); - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { - arc_buf_destroy(buf, FALSE, TRUE); - } else { - ASSERT(buf == hdr->b_buf); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; - } - mutex_exit(hash_lock); - } else if (HDR_IO_IN_PROGRESS(hdr)) { - int destroy_hdr; /* - * We are in the middle of an async write. Don't destroy - * this buffer unless the write completes before we finish - * decrementing the reference count. - */ - mutex_enter(&arc_eviction_mtx); - (void) remove_reference(hdr, NULL, tag); - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); - mutex_exit(&arc_eviction_mtx); - if (destroy_hdr) + * l2arc_write_buffers() relies on a header's L1 portion + * (i.e. its b_pdata field) during its write phase. + * Thus, we cannot push a header onto the arc_l2c_only + * state (removing it's L1 piece) until the header is + * done being written to the l2arc. + */ + if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { + ARCSTAT_BUMP(arcstat_evict_l2_skip); + return (bytes_evicted); + } + + ARCSTAT_BUMP(arcstat_deleted); + bytes_evicted += HDR_GET_LSIZE(hdr); + + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); + + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, hdr, hash_lock); + /* + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. + */ + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); + } else { + ASSERT(hdr->b_l1hdr.b_pdata == NULL); + arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); + } + return (bytes_evicted); + } + + ASSERT(state == arc_mru || state == arc_mfu); + evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; + + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(hdr) || + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < + arc_min_prefetch_lifespan)) { + ARCSTAT_BUMP(arcstat_evict_skip); + return (bytes_evicted); + } + + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + ARCSTAT_BUMP(arcstat_mutex_miss); + break; + } + if (buf->b_data != NULL) + bytes_evicted += HDR_GET_LSIZE(hdr); + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy_impl(buf, B_TRUE); + } + + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (remove_reference(hdr, NULL, tag) > 0) { - ASSERT(HDR_IO_ERROR(hdr)); - arc_buf_destroy(buf, FALSE, TRUE); + if (l2arc_write_eligible(hdr->b_spa, hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + HDR_GET_LSIZE(hdr)); } else { - arc_hdr_destroy(hdr); + ARCSTAT_INCR(arcstat_evict_l2_ineligible, + HDR_GET_LSIZE(hdr)); } } + + if (hdr->b_l1hdr.b_bufcnt == 0) { + arc_cksum_free(hdr); + + bytes_evicted += arc_hdr_size(hdr); + + /* + * If this hdr is being evicted and has a compressed + * buffer then we discard it here before we change states. + * This ensures that the accounting is updated correctly + * in arc_free_data_buf(). + */ + arc_hdr_free_pdata(hdr); + + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + } + + return (bytes_evicted); } -int -arc_buf_remove_ref(arc_buf_t *buf, void* tag) +static uint64_t +arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, + uint64_t spa, int64_t bytes) { - arc_buf_hdr_t *hdr = buf->b_hdr; - kmutex_t *hash_lock = HDR_LOCK(hdr); - int no_callback = (buf->b_efunc == NULL); + multilist_sublist_t *mls; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + int evict_count = 0; - if (hdr->b_state == arc_anon) { - ASSERT(hdr->b_datacnt == 1); - arc_buf_free(buf, tag); - return (no_callback); - } + ASSERT3P(marker, !=, NULL); + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); - mutex_enter(hash_lock); - ASSERT(hdr->b_state != arc_anon); - ASSERT(buf->b_data != NULL); + mls = multilist_sublist_lock(ml, idx); - (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { - if (no_callback) - arc_buf_destroy(buf, FALSE, TRUE); - } else if (no_callback) { - ASSERT(hdr->b_buf == buf && buf->b_next == NULL); - ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + hdr = multilist_sublist_prev(mls, marker)) { + if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || + (evict_count >= zfs_arc_evict_batch_limit)) + break; + + /* + * To keep our iteration location, move the marker + * forward. Since we're not holding hdr's hash lock, we + * must be very careful and not remove 'hdr' from the + * sublist. Otherwise, other consumers might mistake the + * 'hdr' as not being on a sublist when they call the + * multilist_link_active() function (they all rely on + * the hash lock protecting concurrent insertions and + * removals). multilist_sublist_move_forward() was + * specifically implemented to ensure this is the case + * (only 'marker' will be removed and re-inserted). 
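The comment above covers the trickiest part of the new eviction path: arc_evict_state_impl() walks a multilist sublist from the tail under the sublist lock, takes each header's hash lock only with mutex_tryenter(), and keeps a marker header in the sublist so the scan can resume where it left off. Below is a minimal userland sketch of that marker pattern using a plain <sys/queue.h> TAILQ in place of the multilist; node_t, scan_batch() and is_marker are illustrative stand-ins, not names from this patch.

#include <sys/queue.h>
#include <stdio.h>

/* Illustrative stand-ins; not part of the patch. */
typedef struct node {
        int                     is_marker;      /* markers are not real entries */
        int                     id;
        TAILQ_ENTRY(node)       link;
} node_t;

TAILQ_HEAD(nlist, node);

/*
 * Scan at most 'batch' real entries starting just before 'marker',
 * moving only the marker so a later call resumes where we stopped.
 * Walking tail-to-head mirrors the oldest-first order used by the ARC.
 */
static int
scan_batch(struct nlist *list, node_t *marker, int batch)
{
        int scanned = 0;

        while (scanned < batch) {
                node_t *n = TAILQ_PREV(marker, nlist, link);
                if (n == NULL)
                        break;                  /* reached the head */
                /* Keep our place: move the marker, never 'n' itself. */
                TAILQ_REMOVE(list, marker, link);
                TAILQ_INSERT_BEFORE(n, marker, link);
                if (n->is_marker)
                        continue;               /* skip other scans' markers */
                printf("visited %d\n", n->id);
                scanned++;
        }
        return (scanned);
}

int
main(void)
{
        struct nlist list = TAILQ_HEAD_INITIALIZER(list);
        node_t nodes[6], marker = { .is_marker = 1 };

        for (int i = 0; i < 6; i++) {
                nodes[i].is_marker = 0;
                nodes[i].id = i;
                TAILQ_INSERT_HEAD(&list, &nodes[i], link);
        }
        TAILQ_INSERT_TAIL(&list, &marker, link);

        scan_batch(&list, &marker, 2);          /* visits 0, 1 */
        scan_batch(&list, &marker, 2);          /* resumes at 2, 3 */
        return (0);
}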
+ */ + multilist_sublist_move_forward(mls, marker); + + /* + * The only case where the b_spa field should ever be + * zero, is the marker headers inserted by + * arc_evict_state(). It's possible for multiple threads + * to be calling arc_evict_state() concurrently (e.g. + * dsl_pool_close() and zio_inject_fault()), so we must + * skip any markers we see from these other threads. + */ + if (hdr->b_spa == 0) + continue; + + /* we're only interested in evicting buffers of a certain spa */ + if (spa != 0 && hdr->b_spa != spa) { + ARCSTAT_BUMP(arcstat_evict_skip); + continue; + } + + hash_lock = HDR_LOCK(hdr); + + /* + * We aren't calling this function from any code path + * that would already be holding a hash lock, so we're + * asserting on this assumption to be defensive in case + * this ever changes. Without this check, it would be + * possible to incorrectly increment arcstat_mutex_miss + * below (e.g. if the code changed such that we called + * this function with a hash lock held). + */ + ASSERT(!MUTEX_HELD(hash_lock)); + + if (mutex_tryenter(hash_lock)) { + uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + mutex_exit(hash_lock); + + bytes_evicted += evicted; + + /* + * If evicted is zero, arc_evict_hdr() must have + * decided to skip this header, don't increment + * evict_count in this case. + */ + if (evicted != 0) + evict_count++; + + /* + * If arc_size isn't overflowing, signal any + * threads that might happen to be waiting. + * + * For each header evicted, we wake up a single + * thread. If we used cv_broadcast, we could + * wake up "too many" threads causing arc_size + * to significantly overflow arc_c; since + * arc_get_data_buf() doesn't check for overflow + * when it's woken up (it doesn't because it's + * possible for the ARC to be overflowing while + * full of un-evictable buffers, and the + * function should proceed in this case). + * + * If threads are left sleeping, due to not + * using cv_broadcast, they will be woken up + * just before arc_reclaim_thread() sleeps. + */ + mutex_enter(&arc_reclaim_lock); + if (!arc_is_overflowing()) + cv_signal(&arc_reclaim_waiters_cv); + mutex_exit(&arc_reclaim_lock); + } else { + ARCSTAT_BUMP(arcstat_mutex_miss); + } } - ASSERT(no_callback || hdr->b_datacnt > 1 || - refcount_is_zero(&hdr->b_refcnt)); - mutex_exit(hash_lock); - return (no_callback); + + multilist_sublist_unlock(mls); + + return (bytes_evicted); } -int -arc_buf_size(arc_buf_t *buf) +/* + * Evict buffers from the given arc state, until we've removed the + * specified number of bytes. Move the removed buffers to the + * appropriate evict state. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so, may not catch all candidates. + * It may also return without evicting as much space as requested. + * + * If bytes is specified using the special value ARC_EVICT_ALL, this + * will evict all available (i.e. unlocked and evictable) buffers from + * the given arc state; which is used by arc_flush(). + */ +static uint64_t +arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) { - return (buf->b_hdr->b_size); + uint64_t total_evicted = 0; + multilist_t *ml = &state->arcs_list[type]; + int num_sublists; + arc_buf_hdr_t **markers; + + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + + num_sublists = multilist_get_num_sublists(ml); + + /* + * If we've tried to evict from each sublist, made some + * progress, but still have not hit the target number of bytes + * to evict, we want to keep trying. 
The markers allow us to + * pick up where we left off for each individual sublist, rather + * than starting from the tail each time. + */ + markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + for (int i = 0; i < num_sublists; i++) { + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_adjust_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; + + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_insert_tail(mls, markers[i]); + multilist_sublist_unlock(mls); + } + + /* + * While we haven't hit our target number of bytes to evict, or + * we're evicting all available buffers. + */ + while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + /* + * Start eviction using a randomly selected sublist, + * this is to try and evenly balance eviction across all + * sublists. Always starting at the same sublist + * (e.g. index 0) would cause evictions to favor certain + * sublists over others. + */ + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; + + for (int i = 0; i < num_sublists; i++) { + uint64_t bytes_remaining; + uint64_t bytes_evicted; + + if (bytes == ARC_EVICT_ALL) + bytes_remaining = ARC_EVICT_ALL; + else if (total_evicted < bytes) + bytes_remaining = bytes - total_evicted; + else + break; + + bytes_evicted = arc_evict_state_impl(ml, sublist_idx, + markers[sublist_idx], spa, bytes_remaining); + + scan_evicted += bytes_evicted; + total_evicted += bytes_evicted; + + /* we've reached the end, wrap to the beginning */ + if (++sublist_idx >= num_sublists) + sublist_idx = 0; + } + + /* + * If we didn't evict anything during this scan, we have + * no reason to believe we'll evict more during another + * scan, so break the loop. + */ + if (scan_evicted == 0) { + /* This isn't possible, let's make that obvious */ + ASSERT3S(bytes, !=, 0); + + /* + * When bytes is ARC_EVICT_ALL, the only way to + * break the loop is when scan_evicted is zero. + * In that case, we actually have evicted enough, + * so we don't want to increment the kstat. + */ + if (bytes != ARC_EVICT_ALL) { + ASSERT3S(total_evicted, <, bytes); + ARCSTAT_BUMP(arcstat_evict_not_enough); + } + + break; + } + } + + for (int i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_remove(mls, markers[i]); + multilist_sublist_unlock(mls); + + kmem_cache_free(hdr_full_cache, markers[i]); + } + kmem_free(markers, sizeof (*markers) * num_sublists); + + return (total_evicted); } /* - * Evict buffers from list until we've removed the specified number of - * bytes. Move the removed buffers to the appropriate evict state. - * If the recycle flag is set, then attempt to "recycle" a buffer: - * - look for a buffer to evict that is `bytes' long. - * - return the data block from this buffer rather than freeing it. - * This flag is used by callers that are trying to make space for a - * new buffer in a full arc cache. + * Flush all "evictable" data of the given type from the arc state + * specified. This will not evict any "active" buffers (i.e. referenced). * - * This function makes a "best effort". It skips over any buffers - * it can't get a hash_lock on, and so may not catch all candidates. - * It may also return without evicting as much space as requested. + * When 'retry' is set to B_FALSE, the function will make a single pass + * over the state and evict any buffers that it can. 
Since it doesn't + * continually retry the eviction, it might end up leaving some buffers + * in the ARC due to lock misses. + * + * When 'retry' is set to B_TRUE, the function will continually retry the + * eviction until *all* evictable buffers have been removed from the + * state. As a result, if concurrent insertions into the state are + * allowed (e.g. if the ARC isn't shutting down), this function might + * wind up in an infinite loop, continually trying to evict buffers. */ -static void * -arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, - arc_buf_contents_t type) +static uint64_t +arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, + boolean_t retry) { - arc_state_t *evicted_state; - uint64_t bytes_evicted = 0, skipped = 0, missed = 0; - arc_buf_hdr_t *ab, *ab_prev = NULL; - list_t *list = &state->arcs_list[type]; - kmutex_t *hash_lock; - boolean_t have_lock; - void *stolen = NULL; - - ASSERT(state == arc_mru || state == arc_mfu); + uint64_t evicted = 0; - evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + while (refcount_count(&state->arcs_esize[type]) != 0) { + evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); - mutex_enter(&state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); + if (!retry) + break; + } - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); - /* prefetch buffers have a minimum lifespan */ - if (HDR_IO_IN_PROGRESS(ab) || - (spa && ab->b_spa != spa) || - (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - ddi_get_lbolt() - ab->b_arc_access < - arc_min_prefetch_lifespan)) { - skipped++; - continue; - } - /* "lookahead" for better eviction candidate */ - if (recycle && ab->b_size != bytes && - ab_prev && ab_prev->b_size == bytes) - continue; - hash_lock = HDR_LOCK(ab); - have_lock = MUTEX_HELD(hash_lock); - if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); - ASSERT(ab->b_datacnt > 0); - while (ab->b_buf) { - arc_buf_t *buf = ab->b_buf; - if (!rw_tryenter(&buf->b_lock, RW_WRITER)) { - missed += 1; - break; - } - if (buf->b_data) { - bytes_evicted += ab->b_size; - if (recycle && ab->b_type == type && - ab->b_size == bytes && - !HDR_L2_WRITING(ab)) { - stolen = buf->b_data; - recycle = FALSE; - } - } - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - arc_buf_destroy(buf, - buf->b_data == stolen, FALSE); - ab->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&arc_eviction_mtx); - rw_exit(&buf->b_lock); - } else { - rw_exit(&buf->b_lock); - arc_buf_destroy(buf, - buf->b_data == stolen, TRUE); - } - } + return (evicted); +} - if (ab->b_l2hdr) { - ARCSTAT_INCR(arcstat_evict_l2_cached, - ab->b_size); - } else { - if (l2arc_write_eligible(ab->b_spa, ab)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - ab->b_size); - } else { - ARCSTAT_INCR( - arcstat_evict_l2_ineligible, - ab->b_size); - } - } +/* + * Evict the specified number of bytes from the state specified, + * restricting eviction to the spa and type given. This function + * prevents us from trying to evict more from a state's list than + * is "evictable", and to skip evicting altogether when passed a + * negative value for "bytes". In contrast, arc_evict_state() will + * evict everything it can, when passed a negative value for "bytes". 
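As the comment above says, arc_adjust_impl() never asks arc_evict_state() for more than the state's evictable size and does nothing at all for a zero or negative request. A tiny stand-alone sketch of that clamping with made-up numbers; adjust_request() is an illustrative stand-in:

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Clamp an eviction request the way arc_adjust_impl() does. */
static uint64_t
adjust_request(int64_t requested, int64_t evictable)
{
        if (requested > 0 && evictable > 0)
                return (MIN(evictable, requested));
        return (0);     /* nothing requested, or nothing evictable */
}

int
main(void)
{
        /* Ask for 64 MB when only 16 MB is evictable: 16 MB is evicted. */
        printf("%llu\n", (unsigned long long)
            adjust_request(64 << 20, 16 << 20));
        /* A negative target (already under the limit) evicts nothing. */
        printf("%llu\n", (unsigned long long)
            adjust_request(-(8 << 20), 16 << 20));
        return (0);
}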
+ */ +static uint64_t +arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) +{ + int64_t delta; - if (ab->b_datacnt == 0) { - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags |= ARC_IN_HASH_TABLE; - ab->b_flags &= ~ARC_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); - } - if (!have_lock) - mutex_exit(hash_lock); - if (bytes >= 0 && bytes_evicted >= bytes) - break; - } else { - missed += 1; - } + if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { + delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); + return (arc_evict_state(state, spa, delta, type)); } - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&state->arcs_mtx); + return (0); +} - if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x", - (longlong_t)bytes_evicted, state); +/* + * Evict metadata buffers from the cache, such that arc_meta_used is + * capped by the arc_meta_limit tunable. + */ +static uint64_t +arc_adjust_meta(void) +{ + uint64_t total_evicted = 0; + int64_t target; - if (skipped) - ARCSTAT_INCR(arcstat_evict_skip, skipped); + /* + * If we're over the meta limit, we want to evict enough + * metadata to get back under the meta limit. We don't want to + * evict so much that we drop the MRU below arc_p, though. If + * we're over the meta limit more than we're over arc_p, we + * evict some from the MRU here, and some from the MFU below. + */ + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(refcount_count(&arc_anon->arcs_size) + + refcount_count(&arc_mru->arcs_size) - arc_p)); - if (missed) - ARCSTAT_INCR(arcstat_mutex_miss, missed); + total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* - * We have just evicted some date into the ghost state, make - * sure we also adjust the ghost state size if necessary. + * Similar to the above, we want to evict enough bytes to get us + * below the meta limit, but not so much as to drop us below the + * space alloted to the MFU (which is defined as arc_c - arc_p). */ - if (arc_no_grow && - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { - int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + - arc_mru_ghost->arcs_size - arc_c; + target = MIN((int64_t)(arc_meta_used - arc_meta_limit), + (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); - if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { - int64_t todelete = - MIN(arc_mru_ghost->arcs_lsize[type], mru_over); - arc_evict_ghost(arc_mru_ghost, 0, todelete); - } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { - int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], - arc_mru_ghost->arcs_size + - arc_mfu_ghost->arcs_size - arc_c); - arc_evict_ghost(arc_mfu_ghost, 0, todelete); - } - } + total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); - return (stolen); + return (total_evicted); } /* - * Remove buffers from list until we've removed the specified number of - * bytes. Destroy the buffers that are removed. + * Return the type of the oldest buffer in the given arc state + * + * This function will select a random sublist of type ARC_BUFC_DATA and + * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist + * is compared, and the type which contains the "older" buffer will be + * returned. 
*/ -static void -arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) +static arc_buf_contents_t +arc_adjust_type(arc_state_t *state) { - arc_buf_hdr_t *ab, *ab_prev; - list_t *list = &state->arcs_list[ARC_BUFC_DATA]; - kmutex_t *hash_lock; - uint64_t bytes_deleted = 0; - uint64_t bufs_skipped = 0; - boolean_t have_lock; + multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; + multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; + int data_idx = multilist_get_random_index(data_ml); + int meta_idx = multilist_get_random_index(meta_ml); + multilist_sublist_t *data_mls; + multilist_sublist_t *meta_mls; + arc_buf_contents_t type; + arc_buf_hdr_t *data_hdr; + arc_buf_hdr_t *meta_hdr; - ASSERT(GHOST_STATE(state)); -top: - mutex_enter(&state->arcs_mtx); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); - if (spa && ab->b_spa != spa) - continue; - hash_lock = HDR_LOCK(ab); - have_lock = MUTEX_HELD(hash_lock); - if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(ab)); - ASSERT(ab->b_buf == NULL); - ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += ab->b_size; + /* + * We keep the sublist lock until we're finished, to prevent + * the headers from being destroyed via arc_evict_state(). + */ + data_mls = multilist_sublist_lock(data_ml, data_idx); + meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - if (ab->b_l2hdr != NULL) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, ab, hash_lock); - if (!have_lock) - mutex_exit(hash_lock); - } else { - arc_change_state(arc_anon, ab, hash_lock); - if (!have_lock) - mutex_exit(hash_lock); - arc_hdr_destroy(ab); - } + /* + * These two loops are to ensure we skip any markers that + * might be at the tail of the lists due to arc_evict_state(). 
+ */ - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); - if (bytes >= 0 && bytes_deleted >= bytes) - break; - } else { - if (bytes < 0) { - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - goto top; - } - bufs_skipped += 1; - } + for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; + data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { + if (data_hdr->b_spa != 0) + break; } - mutex_exit(&state->arcs_mtx); - if (list == &state->arcs_list[ARC_BUFC_DATA] && - (bytes < 0 || bytes_deleted < bytes)) { - list = &state->arcs_list[ARC_BUFC_METADATA]; - goto top; + for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; + meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { + if (meta_hdr->b_spa != 0) + break; } - if (bufs_skipped) { - ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); - ASSERT(bytes >= 0); + if (data_hdr == NULL && meta_hdr == NULL) { + type = ARC_BUFC_DATA; + } else if (data_hdr == NULL) { + ASSERT3P(meta_hdr, !=, NULL); + type = ARC_BUFC_METADATA; + } else if (meta_hdr == NULL) { + ASSERT3P(data_hdr, !=, NULL); + type = ARC_BUFC_DATA; + } else { + ASSERT3P(data_hdr, !=, NULL); + ASSERT3P(meta_hdr, !=, NULL); + + /* The headers can't be on the sublist without an L1 header */ + ASSERT(HDR_HAS_L1HDR(data_hdr)); + ASSERT(HDR_HAS_L1HDR(meta_hdr)); + + if (data_hdr->b_l1hdr.b_arc_access < + meta_hdr->b_l1hdr.b_arc_access) { + type = ARC_BUFC_DATA; + } else { + type = ARC_BUFC_METADATA; + } } - if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p", - (longlong_t)bytes_deleted, state); + multilist_sublist_unlock(meta_mls); + multilist_sublist_unlock(data_mls); + + return (type); } -static void +/* + * Evict buffers from the cache, such that arc_size is capped by arc_c. + */ +static uint64_t arc_adjust(void) { - int64_t adjustment, delta; + uint64_t total_evicted = 0; + uint64_t bytes; + int64_t target; /* - * Adjust MRU size + * If we're over arc_meta_limit, we want to correct that before + * potentially evicting data buffers below. */ + total_evicted += arc_adjust_meta(); - adjustment = MIN(arc_size - arc_c, - arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); + /* + * Adjust MRU size + * + * If we're over the target cache size, we want to evict enough + * from the list to get back to our target size. We don't want + * to evict too much from the MRU, such that it drops below + * arc_p. So, if we're over our target cache size more than + * the MRU is over arc_p, we'll evict enough to get back to + * arc_p here, and then evict more from the MFU below. + */ + target = MIN((int64_t)(arc_size - arc_c), + (int64_t)(refcount_count(&arc_anon->arcs_size) + + refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); + + /* + * If we're below arc_meta_min, always prefer to evict data. + * Otherwise, try to satisfy the requested number of bytes to + * evict from the type which contains older buffers; in an + * effort to keep newer buffers in the cache regardless of their + * type. If we cannot satisfy the number of bytes from this + * type, spill over into the next type. 
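The spill-over policy described above reduces to: evict up to the target from the preferred type first, subtract what was actually obtained, and pass the remainder to the other type. A self-contained sketch with invented sizes; evict_type() and bufc_t are stand-ins for arc_adjust_impl() and arc_buf_contents_t:

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

typedef enum { BUFC_DATA, BUFC_METADATA } bufc_t;

/* Stand-in for arc_adjust_impl(): evict up to 'target' from one type. */
static int64_t
evict_type(int64_t evictable[2], bufc_t type, int64_t target)
{
        int64_t delta = (target > 0) ? MIN(evictable[type], target) : 0;

        evictable[type] -= delta;
        return (delta);
}

int
main(void)
{
        /* 30 MB over target; metadata holds the older buffers. */
        int64_t evictable[2] = { [BUFC_DATA] = 100 << 20,
                                 [BUFC_METADATA] = 10 << 20 };
        int64_t target = 30 << 20, evicted;

        evicted = evict_type(evictable, BUFC_METADATA, target);
        target -= evicted;                      /* 10 MB came from metadata */
        evicted += evict_type(evictable, BUFC_DATA, target);
        printf("evicted %lld MB\n", (long long)(evicted >> 20));       /* 30 */
        return (0);
}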
+ */ + if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. + */ + target -= bytes; - if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, - ARC_BUFC_METADATA); + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from metadata. + */ + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Adjust MFU size - */ + * + * Now that we've tried to evict enough from the MRU to get its + * size back to arc_p, if we're still above the target cache + * size, we evict the rest from the MFU. + */ + target = arc_size - arc_c; + + if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && + arc_meta_used > arc_meta_min) { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + total_evicted += bytes; + + /* + * If we couldn't evict our target number of bytes from + * metadata, we try to get the rest from data. + */ + target -= bytes; - adjustment = arc_size - arc_c; + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + } else { + bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); - (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); - adjustment -= delta; - } + /* + * If we couldn't evict our target number of bytes from + * data, we try to get the rest from data. + */ + target -= bytes; - if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t delta = MIN(adjustment, - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); - (void) arc_evict(arc_mfu, 0, delta, FALSE, - ARC_BUFC_METADATA); + total_evicted += + arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists + * + * In addition to the above, the ARC also defines target values + * for the ghost lists. The sum of the mru list and mru ghost + * list should never exceed the target size of the cache, and + * the sum of the mru list, mfu list, mru ghost list, and mfu + * ghost list should never exceed twice the target size of the + * cache. The following logic enforces these limits on the ghost + * caches, and evicts from them as needed. 
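The ghost-list limits described above boil down to two targets: mru + mru_ghost must not exceed arc_c, and mru_ghost + mfu_ghost must not exceed arc_c, which together keep all four lists within 2 * arc_c. A worked example with made-up sizes; plain variables here, not the ARC's refcount-backed counters:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        /* Illustrative sizes in MB. */
        int64_t arc_c = 1024, mru = 700, mru_ghost = 500, mfu_ghost = 800;

        /* First ghost pass: keep mru + mru_ghost <= arc_c. */
        int64_t target = mru + mru_ghost - arc_c;               /* 176 */
        printf("evict %lld MB from mru_ghost\n", (long long)target);
        mru_ghost -= target;

        /* Second pass: keep mru_ghost + mfu_ghost <= arc_c. */
        target = mru_ghost + mfu_ghost - arc_c;                 /* 100 */
        printf("evict %lld MB from mfu_ghost\n", (long long)target);

        /* A non-positive target would simply evict nothing. */
        return (0);
}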
*/ + target = refcount_count(&arc_mru->arcs_size) + + refcount_count(&arc_mru_ghost->arcs_size) - arc_c; - adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; - if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { - delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, 0, delta); - } + target -= bytes; - adjustment = - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + total_evicted += + arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); - if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { - delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, 0, delta); - } -} + /* + * We assume the sum of the mru list and mfu list is less than + * or equal to arc_c (we enforced this above), which means we + * can use the simpler of the two equations below: + * + * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c + * mru ghost + mfu ghost <= arc_c + */ + target = refcount_count(&arc_mru_ghost->arcs_size) + + refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; -static void -arc_do_user_evicts(void) -{ - mutex_enter(&arc_eviction_mtx); - while (arc_eviction_list != NULL) { - arc_buf_t *buf = arc_eviction_list; - arc_eviction_list = buf->b_next; - rw_enter(&buf->b_lock, RW_WRITER); - buf->b_hdr = NULL; - rw_exit(&buf->b_lock); - mutex_exit(&arc_eviction_mtx); + bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; - if (buf->b_efunc != NULL) - VERIFY(buf->b_efunc(buf) == 0); + target -= bytes; - buf->b_efunc = NULL; - buf->b_private = NULL; - kmem_cache_free(buf_cache, buf); - mutex_enter(&arc_eviction_mtx); - } - mutex_exit(&arc_eviction_mtx); + total_evicted += + arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); } -/* - * Flush all *evictable* data from the cache for the given spa. - * NOTE: this will not touch "active" (i.e. referenced) data. - */ void -arc_flush(spa_t *spa) +arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; - if (spa) - guid = spa_guid(spa); + /* + * If retry is B_TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. 
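arc_flush() above takes a retry flag: a single best-effort pass may leave buffers behind because of hash-lock misses, while retry == B_TRUE loops until the state's evictable size reaches zero, which is why it is only allowed when no spa is given. A rough stand-alone sketch of that loop shape; evict_pass() and its "lock miss" behaviour are invented for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Invented stand-in for one arc_evict_state() pass: pretend half the
 * buffers' hash locks happen to be busy, so only half of what is
 * evictable actually goes in a single pass.
 */
static uint64_t
evict_pass(uint64_t *evictable)
{
        uint64_t done = (*evictable + 1) / 2;

        *evictable -= done;
        return (done);
}

/* Shape of arc_flush_state(): one pass, or loop until nothing is left. */
static uint64_t
flush(uint64_t *evictable, bool retry)
{
        uint64_t evicted = 0;

        while (*evictable != 0) {
                evicted += evict_pass(evictable);
                if (!retry)
                        break;
        }
        return (evicted);
}

int
main(void)
{
        uint64_t a = 1000, b = 1000, got;

        got = flush(&a, false);
        printf("single pass: %llu evicted, %llu left\n",
            (unsigned long long)got, (unsigned long long)a);
        got = flush(&b, true);
        printf("with retry:  %llu evicted, %llu left\n",
            (unsigned long long)got, (unsigned long long)b);
        return (0);
}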
+ */ + ASSERT(!retry || spa == 0); - while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) - break; - } - while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) - break; - } + if (spa != NULL) + guid = spa_load_guid(spa); + + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); - arc_evict_ghost(arc_mru_ghost, guid, -1); - arc_evict_ghost(arc_mfu_ghost, guid, -1); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); - mutex_enter(&arc_reclaim_thr_lock); - arc_do_user_evicts(); - mutex_exit(&arc_reclaim_thr_lock); - ASSERT(spa || arc_eviction_list == NULL); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void -arc_shrink(void) +arc_shrink(int64_t to_free) { if (arc_c > arc_c_min) { - uint64_t to_free; - -#ifdef _KERNEL - to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); -#else - to_free = arc_c >> arc_shrink_shift; -#endif + DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, + arc_c_min, uint64_t, arc_p, uint64_t, to_free); if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else @@ -1960,29 +3862,79 @@ arc_shrink(void) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); + + DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, + arc_p); + ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } - if (arc_size > arc_c) - arc_adjust(); + if (arc_size > arc_c) { + DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, + uint64_t, arc_c); + (void) arc_adjust(); + } } -static int -arc_reclaim_needed(void) +static long needfree = 0; + +typedef enum free_memory_reason_t { + FMR_UNKNOWN, + FMR_NEEDFREE, + FMR_LOTSFREE, + FMR_SWAPFS_MINFREE, + FMR_PAGES_PP_MAXIMUM, + FMR_HEAP_ARENA, + FMR_ZIO_ARENA, + FMR_ZIO_FRAG, +} free_memory_reason_t; + +int64_t last_free_memory; +free_memory_reason_t last_free_reason; + +/* + * Additional reserve of pages for pp_reserve. + */ +int64_t arc_pages_pp_reserve = 64; + +/* + * Additional reserve of pages for swapfs. + */ +int64_t arc_swapfs_reserve = 64; + +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. + */ +static int64_t +arc_available_memory(void) { - uint64_t extra; + int64_t lowest = INT64_MAX; + int64_t n; + free_memory_reason_t r = FMR_UNKNOWN; #ifdef _KERNEL - - if (needfree) - return (1); + if (needfree > 0) { + n = PAGESIZE * (-needfree); + if (n < lowest) { + lowest = n; + r = FMR_NEEDFREE; + } + } /* - * take 'desfree' extra pages, so we reclaim sooner, rather than later + * Cooperate with pagedaemon when it's time for it to scan + * and reclaim some pages. 
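arc_available_memory() above folds several independent low-memory checks into one signed headroom value: each check computes how many bytes remain before that resource becomes critical, and the smallest (possibly negative) value wins together with its reason. A reduced userland sketch of the pattern; the three checks and all numbers are invented:

#include <stdio.h>
#include <stdint.h>

typedef enum { R_UNKNOWN, R_FREE_PAGES, R_SWAP, R_KVA } reason_t;

static const char *reason_names[] = { "unknown", "free pages", "swap", "kva" };

static int64_t
available_memory(reason_t *why)
{
        int64_t lowest = INT64_MAX, n;
        reason_t r = R_UNKNOWN;

        /* Invented headroom figures; each is "bytes until trouble". */
        n = 512 << 20;                  /* free page headroom */
        if (n < lowest) { lowest = n; r = R_FREE_PAGES; }

        n = -(64 << 20);                /* swap reserve already exceeded */
        if (n < lowest) { lowest = n; r = R_SWAP; }

        n = 128 << 20;                  /* kernel VA headroom */
        if (n < lowest) { lowest = n; r = R_KVA; }

        *why = r;
        return (lowest);
}

int
main(void)
{
        reason_t why;
        int64_t n = available_memory(&why);

        if (n < 0)
                printf("reclaim %lld MB (reason: %s)\n",
                    (long long)-n, reason_names[why]);
        return (0);
}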
*/ - extra = desfree; + n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); + if (n < lowest) { + lowest = n; + r = FMR_LOTSFREE; + } +#ifdef illumos /* * check that we're out of range of the pageout scanner. It starts to * schedule paging if freemem is less than lotsfree and needfree. @@ -1990,8 +3942,11 @@ arc_reclaim_needed(void) * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ - if (freemem < lotsfree + needfree + extra) - return (1); + n = PAGESIZE * (freemem - lotsfree - needfree - desfree); + if (n < lowest) { + lowest = n; + r = FMR_LOTSFREE; + } /* * check to make sure that swapfs has enough space so that anon @@ -2000,14 +3955,34 @@ arc_reclaim_needed(void) * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ - if (availrmem < swapfs_minfree + swapfs_reserve + extra) - return (1); + n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - + desfree - arc_swapfs_reserve); + if (n < lowest) { + lowest = n; + r = FMR_SWAPFS_MINFREE; + } + -#if defined(__i386) + /* + * Check that we have enough availrmem that memory locking (e.g., via + * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum + * stores the number of pages that cannot be locked; when availrmem + * drops below pages_pp_maximum, page locking mechanisms such as + * page_pp_lock() will fail.) + */ + n = PAGESIZE * (availrmem - pages_pp_maximum - + arc_pages_pp_reserve); + if (n < lowest) { + lowest = n; + r = FMR_PAGES_PP_MAXIMUM; + } + +#endif /* illumos */ +#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical - * memory. Most checks of the size of the kmem_area compare against + * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to @@ -2015,27 +3990,87 @@ arc_reclaim_needed(void) * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ - if (btop(vmem_size(kmem_arena, VMEM_FREE)) < - (btop(vmem_size(kmem_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) - return (1); + n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - + (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); + if (n < lowest) { + lowest = n; + r = FMR_HEAP_ARENA; + } +#define zio_arena NULL +#else +#define zio_arena heap_arena #endif -#else - if (spa_get_random(100) == 0) - return (1); + /* + * If zio data pages are being allocated out of a separate heap segment, + * then enforce that the size of available vmem for this arena remains + * above about 1/16th free. + * + * Note: The 1/16th arena free requirement was put in place + * to aggressively evict memory from the arc in order to avoid + * memory fragmentation issues. + */ + if (zio_arena != NULL) { + n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - + (vmem_size(zio_arena, VMEM_ALLOC) >> 4); + if (n < lowest) { + lowest = n; + r = FMR_ZIO_ARENA; + } + } + +#if __FreeBSD__ + /* + * Above limits know nothing about real level of KVA fragmentation. + * Start aggressive reclamation if too little sequential KVA left. + */ + if (lowest > 0) { + n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ? 
+ -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : + INT64_MAX; + if (n < lowest) { + lowest = n; + r = FMR_ZIO_FRAG; + } + } #endif - return (0); + +#else /* _KERNEL */ + /* Every 100 calls, free a small amount */ + if (spa_get_random(100) == 0) + lowest = -1024; +#endif /* _KERNEL */ + + last_free_memory = lowest; + last_free_reason = r; + DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); + return (lowest); } -static void -arc_kmem_reap_now(arc_reclaim_strategy_t strat) + +/* + * Determine if the system is under memory pressure and is asking + * to reclaim memory. A return value of B_TRUE indicates that the system + * is under memory pressure and that the arc should adjust accordingly. + */ +static boolean_t +arc_reclaim_needed(void) +{ + return (arc_available_memory() < 0); +} + +extern kmem_cache_t *zio_buf_cache[]; +extern kmem_cache_t *zio_data_buf_cache[]; +extern kmem_cache_t *range_seg_cache; + +static __noinline void +arc_kmem_reap_now(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; - extern kmem_cache_t *zio_buf_cache[]; - extern kmem_cache_t *zio_data_buf_cache[]; + DTRACE_PROBE(arc__kmem_reap_start); #ifdef _KERNEL if (arc_meta_used >= arc_meta_limit) { /* @@ -2052,13 +4087,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t #endif #endif - /* - * An aggressive reclamation will shrink the cache size as well as - * reap free buffers from the arc kmem caches. - */ - if (strat == ARC_RECLAIM_AGGR) - arc_shrink(); - for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; @@ -2070,64 +4098,202 @@ arc_kmem_reap_now(arc_reclaim_strategy_t } } kmem_cache_reap_now(buf_cache); - kmem_cache_reap_now(hdr_cache); + kmem_cache_reap_now(hdr_full_cache); + kmem_cache_reap_now(hdr_l2only_cache); + kmem_cache_reap_now(range_seg_cache); + +#ifdef illumos + if (zio_arena != NULL) { + /* + * Ask the vmem arena to reclaim unused memory from its + * quantum caches. + */ + vmem_qcache_reap(zio_arena); + } +#endif + DTRACE_PROBE(arc__kmem_reap_end); } +/* + * Threads can block in arc_get_data_buf() waiting for this thread to evict + * enough data and signal them to proceed. When this happens, the threads in + * arc_get_data_buf() are sleeping while holding the hash lock for their + * particular arc header. Thus, we must be careful to never sleep on a + * hash lock in this thread. This is to prevent the following deadlock: + * + * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * waiting for the reclaim thread to signal it. + * + * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, + * fails, and goes to sleep forever. + * + * This possible deadlock is avoided by always acquiring a hash lock + * using mutex_tryenter() from arc_reclaim_thread(). 
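The deadlock sketched in the comment above has the classic "the waiter holds a lock the waker needs" shape; the fix is that the reclaim thread only ever tries hash locks and skips any it cannot get. A small pthread rendering of that rule; the locks, threads and names here are illustrative only and none of this is ARC code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative locks only; not the ARC's. */
static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_cv = PTHREAD_COND_INITIALIZER;
static bool space_available = false;

/* Like arc_get_data_buf(): sleeps on the CV while holding a hash lock. */
static void *
consumer(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&hash_lock);
        pthread_mutex_lock(&reclaim_lock);
        while (!space_available)
                pthread_cond_wait(&reclaim_cv, &reclaim_lock);
        pthread_mutex_unlock(&reclaim_lock);
        pthread_mutex_unlock(&hash_lock);
        return (NULL);
}

/* Like the reclaim thread: never blocks on a hash lock, only tries it. */
static void *
reclaimer(void *arg)
{
        (void)arg;
        if (pthread_mutex_trylock(&hash_lock) == 0) {
                /* got it: evict under the lock */
                pthread_mutex_unlock(&hash_lock);
        }
        /* busy or not, we can still make space and wake the waiters */
        pthread_mutex_lock(&reclaim_lock);
        space_available = true;
        pthread_cond_broadcast(&reclaim_cv);
        pthread_mutex_unlock(&reclaim_lock);
        return (NULL);
}

int
main(void)
{
        pthread_t c, r;

        pthread_create(&c, NULL, consumer, NULL);
        sleep(1);       /* let the consumer block while holding hash_lock */
        pthread_create(&r, NULL, reclaimer, NULL);
        pthread_join(c, NULL);
        pthread_join(r, NULL);
        printf("no deadlock: the reclaimer used trylock\n");
        return (0);
}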
+ */ static void -arc_reclaim_thread(void *unused __unused) +arc_reclaim_thread(void *dummy __unused) { - clock_t growtime = 0; - arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + hrtime_t growtime = 0; callb_cpr_t cpr; - CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); - mutex_enter(&arc_reclaim_thr_lock); - while (arc_thread_exit == 0) { - if (arc_reclaim_needed()) { + mutex_enter(&arc_reclaim_lock); + while (!arc_reclaim_thread_exit) { + uint64_t evicted = 0; - if (arc_no_grow) { - if (last_reclaim == ARC_RECLAIM_CONS) { - last_reclaim = ARC_RECLAIM_AGGR; - } else { - last_reclaim = ARC_RECLAIM_CONS; - } - } else { - arc_no_grow = TRUE; - last_reclaim = ARC_RECLAIM_AGGR; - membar_producer(); - } + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + + mutex_exit(&arc_reclaim_lock); + + /* + * We call arc_adjust() before (possibly) calling + * arc_kmem_reap_now(), so that we can wake up + * arc_get_data_buf() sooner. + */ + evicted = arc_adjust(); - /* reset the growth delay for every reclaim */ - growtime = ddi_get_lbolt() + (arc_grow_retry * hz); + int64_t free_memory = arc_available_memory(); + if (free_memory < 0) { - arc_kmem_reap_now(last_reclaim); + arc_no_grow = B_TRUE; arc_warm = B_TRUE; - } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { - arc_no_grow = FALSE; + /* + * Wait at least zfs_grow_retry (default 60) seconds + * before considering growing. + */ + growtime = gethrtime() + SEC2NSEC(arc_grow_retry); + + arc_kmem_reap_now(); + + /* + * If we are still low on memory, shrink the ARC + * so that we have arc_shrink_min free space. + */ + free_memory = arc_available_memory(); + + int64_t to_free = + (arc_c >> arc_shrink_shift) - free_memory; + if (to_free > 0) { +#ifdef _KERNEL + to_free = MAX(to_free, ptob(needfree)); +#endif + arc_shrink(to_free); + } + } else if (free_memory < arc_c >> arc_no_grow_shift) { + arc_no_grow = B_TRUE; + } else if (gethrtime() >= growtime) { + arc_no_grow = B_FALSE; } - if (2 * arc_c < arc_size + - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) - arc_adjust(); + mutex_enter(&arc_reclaim_lock); - if (arc_eviction_list != NULL) - arc_do_user_evicts(); + /* + * If evicted is zero, we couldn't evict anything via + * arc_adjust(). This could be due to hash lock + * collisions, but more likely due to the majority of + * arc buffers being unevictable. Therefore, even if + * arc_size is above arc_c, another pass is unlikely to + * be helpful and could potentially cause us to enter an + * infinite loop. + */ + if (arc_size <= arc_c || evicted == 0) { +#ifdef _KERNEL + needfree = 0; +#endif + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. 
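When memory is short, arc_reclaim_thread() above shrinks the cache by the current deficit plus arc_c >> arc_shrink_shift, so that roughly that much headroom is free afterwards. Worked arithmetic with example values; the shift and sizes are illustrative, not the tunables' defaults:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        /* Illustrative values. */
        int64_t arc_c = 4LL << 30;              /* 4 GB target size */
        int shrink_shift = 7;                   /* example shift */
        int64_t free_memory = -(32LL << 20);    /* 32 MB short */

        /* Free the deficit plus arc_c >> shrink_shift of headroom. */
        int64_t to_free = (arc_c >> shrink_shift) - free_memory;
        printf("shrink by %lld MB\n", (long long)(to_free >> 20));     /* 64 */
        return (0);
}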
+ */ + cv_broadcast(&arc_reclaim_waiters_cv); - /* block until needed, or one second, whichever is shorter */ - CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, (hz)); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + /* + * Block until signaled, or after one second (we + * might need to perform arc_kmem_reap_now() + * even if we aren't being signalled) + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&arc_reclaim_thread_cv, + &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); + } } - arc_thread_exit = 0; - cv_broadcast(&arc_reclaim_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + arc_reclaim_thread_exit = B_FALSE; + cv_broadcast(&arc_reclaim_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ + thread_exit(); +} + +#ifdef __FreeBSD__ + +static u_int arc_dnlc_evicts_arg; +extern struct vfsops zfs_vfsops; + +static void +arc_dnlc_evicts_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + u_int percent; + + CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_dnlc_evicts_lock); + while (!arc_dnlc_evicts_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); + CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); + if (arc_dnlc_evicts_arg != 0) { + percent = arc_dnlc_evicts_arg; + mutex_exit(&arc_dnlc_evicts_lock); +#ifdef _KERNEL + vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); +#endif + mutex_enter(&arc_dnlc_evicts_lock); + /* + * Clear our token only after vnlru_free() + * pass is done, to avoid false queueing of + * the requests. + */ + arc_dnlc_evicts_arg = 0; + } + } + arc_dnlc_evicts_thread_exit = FALSE; + cv_broadcast(&arc_dnlc_evicts_cv); + CALLB_CPR_EXIT(&cpr); thread_exit(); } +void +dnlc_reduce_cache(void *arg) +{ + u_int percent; + + percent = (u_int)(uintptr_t)arg; + mutex_enter(&arc_dnlc_evicts_lock); + if (arc_dnlc_evicts_arg == 0) { + arc_dnlc_evicts_arg = percent; + cv_broadcast(&arc_dnlc_evicts_cv); + } + mutex_exit(&arc_dnlc_evicts_lock); +} + +#endif + /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called @@ -2138,6 +4304,8 @@ arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); + int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); + int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); if (state == arc_l2c_only) return; @@ -2152,15 +4320,15 @@ arc_adapt(int bytes, arc_state_t *state) * target size of the MRU list. */ if (state == arc_mru_ghost) { - mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? - 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); + mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; - mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? - 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); + mult = (mfug_size >= mrug_size) ? 
1 : (mrug_size / mfug_size); + mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); @@ -2168,7 +4336,7 @@ arc_adapt(int bytes, arc_state_t *state) ASSERT((int64_t)arc_p >= 0); if (arc_reclaim_needed()) { - cv_signal(&arc_reclaim_thr_cv); + cv_signal(&arc_reclaim_thread_cv); return; } @@ -2183,6 +4351,7 @@ arc_adapt(int bytes, arc_state_t *state) * cache size, increment the target cache size */ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + DTRACE_PROBE1(arc__inc_adapt, int, bytes); atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; @@ -2195,135 +4364,144 @@ arc_adapt(int bytes, arc_state_t *state) } /* - * Check if the cache has reached its limits and eviction is required - * prior to insert. + * Check if arc_size has grown past our upper threshold, determined by + * zfs_arc_overflow_shift. */ -static int -arc_evict_needed(arc_buf_contents_t type) +static boolean_t +arc_is_overflowing(void) { - if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) - return (1); - -#ifdef _KERNEL - /* - * If zio data pages are being allocated out of a separate heap segment, - * then enforce that the size of available vmem for this area remains - * above about 1/32nd free. - */ - if (type == ARC_BUFC_DATA && zio_arena != NULL && - vmem_size(zio_arena, VMEM_FREE) < - (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) - return (1); -#endif + /* Always allow at least one block of overflow */ + uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, + arc_c >> zfs_arc_overflow_shift); - if (arc_reclaim_needed()) - return (1); - - return (arc_size > arc_c); + return (arc_size >= arc_c + overflow); } -/* - * The buffer, supplied as the first argument, needs a data block. - * So, if we are at cache max, determine which cache should be victimized. - * We have the following cases: - * - * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> - * In this situation if we're out of space, but the resident size of the MFU is - * under the limit, victimize the MFU cache to satisfy this insertion request. - * - * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> - * Here, we've used up all of the available space for the MRU, so we need to - * evict from our own cache instead. Evict from the set of resident MRU - * entries. - * - * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> - * c minus p represents the MFU space in the cache, since p is the size of the - * cache that is dedicated to the MRU. In this situation there's still space on - * the MFU side, so the MRU side needs to be victimized. - * - * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> - * MFU's resident set is consuming more space than it has been allotted. In - * this situation, we must victimize our own cache, the MFU, for this insertion. +/* + * Allocate a block and return it to the caller. If we are hitting the + * hard limit for the cache size, we must sleep, waiting for the eviction + * thread to catch up. If we're past the target size but below the hard + * limit, we'll only signal the reclaim thread and continue on. 
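arc_is_overflowing() above only reports overflow once arc_size exceeds arc_c by at least one maximum-sized block, or by arc_c >> zfs_arc_overflow_shift if that is larger. A worked example; the block size and shift below are example values, not the tree's constants:

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
        /* Illustrative values only. */
        uint64_t max_block = 16ULL << 20;       /* example max block size */
        uint64_t arc_c = 4ULL << 30;            /* 4 GB target */
        int overflow_shift = 8;                 /* example shift */

        uint64_t overflow = MAX(max_block, arc_c >> overflow_shift);
        printf("overflow allowance: %llu MB\n",
            (unsigned long long)(overflow >> 20));      /* 16 MB */

        uint64_t arc_size = arc_c + (8ULL << 20);       /* 8 MB over target */
        printf("overflowing: %s\n",
            arc_size >= arc_c + overflow ? "yes" : "no");       /* no */
        return (0);
}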
*/ -static void -arc_get_data_buf(arc_buf_t *buf) +static void * +arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { - arc_state_t *state = buf->b_hdr->b_state; - uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + void *datap = NULL; + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); /* - * We have not yet reached cache maximum size, - * just allocate a new buffer. + * If arc_size is currently overflowing, and has grown past our + * upper limit, we must be adding data faster than the evict + * thread can evict. Thus, to ensure we don't compound the + * problem by adding more data and forcing arc_size to grow even + * further past it's target size, we halt and wait for the + * eviction thread to catch up. + * + * It's also possible that the reclaim thread is unable to evict + * enough buffers to get arc_size below the overflow limit (e.g. + * due to buffers being un-evictable, or hash lock collisions). + * In this case, we want to proceed regardless if we're + * overflowing; thus we don't use a while loop here. */ - if (!arc_evict_needed(type)) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_DATA); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - ARCSTAT_INCR(arcstat_data_size, size); - atomic_add_64(&arc_size, size); + if (arc_is_overflowing()) { + mutex_enter(&arc_reclaim_lock); + + /* + * Now that we've acquired the lock, we may no longer be + * over the overflow limit, lets check. + * + * We're ignoring the case of spurious wake ups. If that + * were to happen, it'd let this thread consume an ARC + * buffer before it should have (i.e. before we're under + * the overflow limit and were signalled by the reclaim + * thread). As long as that is a rare occurrence, it + * shouldn't cause any harm. + */ + if (arc_is_overflowing()) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } - goto out; + + mutex_exit(&arc_reclaim_lock); } - /* - * If we are prefetching from the mfu ghost list, this buffer - * will end up on the mru list; so steal space from there. - */ - if (state == arc_mfu_ghost) - state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; - else if (state == arc_mru_ghost) - state = arc_mru; - - if (state == arc_mru || state == arc_anon) { - uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] >= size && - arc_p > mru_used) ? arc_mfu : arc_mru; + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + datap = zio_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_META); } else { - /* MFU cases */ - uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] >= size && - mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; - } - if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_DATA); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - ARCSTAT_INCR(arcstat_data_size, size); - atomic_add_64(&arc_size, size); - } - ARCSTAT_BUMP(arcstat_recycle_miss); + ASSERT(type == ARC_BUFC_DATA); + datap = zio_data_buf_alloc(size); + arc_space_consume(size, ARC_SPACE_DATA); } - ASSERT(buf->b_data != NULL); -out: + /* * Update the state size. 
Note that ghost states have a * "ghost size" and so don't need to be updated. */ - if (!GHOST_STATE(buf->b_hdr->b_state)) { - arc_buf_hdr_t *hdr = buf->b_hdr; + if (!GHOST_STATE(state)) { - atomic_add_64(&hdr->b_state->arcs_size, size); - if (list_link_active(&hdr->b_arc_node)) { - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + (void) refcount_add_many(&state->arcs_size, size, tag); + + /* + * If this is reached via arc_read, the link is + * protected by the hash lock. If reached via + * arc_buf_alloc, the header should not be accessed by + * any other thread. And, if reached via arc_read_done, + * the hash lock will protect it if it's found in the + * hash table; otherwise no other thread should be + * trying to [add|remove]_reference it. + */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + (void) refcount_add_many(&state->arcs_esize[type], + size, tag); } + /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ - if (arc_size < arc_c && hdr->b_state == arc_anon && - arc_anon->arcs_size + arc_mru->arcs_size > arc_p) + if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && + (refcount_count(&arc_anon->arcs_size) + + refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } + ARCSTAT_BUMP(arcstat_allocated); + return (datap); +} + +/* + * Free the arc data buffer. + */ +static void +arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) +{ + arc_state_t *state = hdr->b_l1hdr.b_state; + arc_buf_contents_t type = arc_buf_type(hdr); + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); + + (void) refcount_remove_many(&state->arcs_esize[type], + size, tag); + } + (void) refcount_remove_many(&state->arcs_size, size, tag); + + VERIFY3U(hdr->b_type, ==, type); + if (type == ARC_BUFC_METADATA) { + zio_buf_free(data, size); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + zio_data_buf_free(data, size); + arc_space_return(size, ARC_SPACE_DATA); + } } /* @@ -2331,25 +4509,26 @@ out: * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (buf->b_state == arc_anon) { + if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ - ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); - arc_change_state(arc_mru, buf, hash_lock); + ASSERT0(hdr->b_l1hdr.b_arc_access); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr, hash_lock); - } else if (buf->b_state == arc_mru) { + } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* @@ -2360,14 +4539,16 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). 
*/ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - if (refcount_count(&buf->b_refcnt) == 0) { - ASSERT(list_link_active(&buf->b_arc_node)); + if (HDR_PREFETCH(hdr)) { + if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + /* link protected by hash lock */ + ASSERT(multilist_link_active( + &hdr->b_l1hdr.b_arc_node)); } else { - buf->b_flags &= ~ARC_PREFETCH; + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; return; } @@ -2376,18 +4557,18 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t * but it is still in the cache. Move it to the MFU * state. */ - if (now > buf->b_arc_access + ARC_MINTIME) { + if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - buf->b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } ARCSTAT_BUMP(arcstat_mru_hits); - } else if (buf->b_state == arc_mru_ghost) { + } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but @@ -2395,21 +4576,21 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (HDR_PREFETCH(hdr)) { new_state = arc_mru; - if (refcount_count(&buf->b_refcnt) > 0) - buf->b_flags &= ~ARC_PREFETCH; - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - buf->b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (buf->b_state == arc_mfu) { + } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. @@ -2419,13 +4600,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - ASSERT(refcount_count(&buf->b_refcnt) == 0); - ASSERT(list_link_active(&buf->b_arc_node)); + if ((HDR_PREFETCH(hdr)) != 0) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + /* link protected by hash_lock */ + ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = ddi_get_lbolt(); - } else if (buf->b_state == arc_mfu_ghost) { + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has @@ -2433,28 +4615,28 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. 
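The arc_access() rework above keeps the old promotion rule: a demand hit on an MRU buffer moves it to the MFU state only if more than ARC_MINTIME (the comment puts it at about 125 ms) has passed since the last access, so immediately repeated reads of one block do not count as reuse. A tiny stand-alone rendering of that rule; HZ, MINTIME_TICKS and the tick values are illustrative stand-ins:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Illustrative stand-ins for lbolt ticks: 1000 ticks per second. */
#define HZ              1000
#define MINTIME_TICKS   (HZ >> 3)       /* ~125 ms */

/* Should a demand hit promote an MRU buffer to the MFU state? */
static bool
promote_to_mfu(int64_t now, int64_t last_access)
{
        return (now > last_access + MINTIME_TICKS);
}

int
main(void)
{
        int64_t first_access = 10000;

        /* A re-read 50 ms later stays in MRU... */
        printf("%d\n", promote_to_mfu(first_access + 50, first_access));
        /* ...but a re-read 200 ms later is treated as reuse -> MFU. */
        printf("%d\n", promote_to_mfu(first_access + 200, first_access));
        return (0);
}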
*/ - ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(new_state, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (buf->b_state == arc_l2c_only) { + } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } else { ASSERT(!"invalid arc state"); } @@ -2465,8 +4647,9 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { - bcopy(buf->b_data, arg, buf->b_hdr->b_size); - VERIFY(arc_buf_remove_ref(buf, arg) == 1); + if (zio == NULL || zio->io_error == 0) + bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); + arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ @@ -2475,25 +4658,38 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *b { arc_buf_t **bufp = arg; if (zio && zio->io_error) { - VERIFY(arc_buf_remove_ref(buf, arg) == 1); + arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; + ASSERT(buf->b_data); + } +} + +static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); } } static void arc_read_done(zio_t *zio) { - arc_buf_hdr_t *hdr, *found; - arc_buf_t *buf; - arc_buf_t *abuf; /* buffer we're assigning to callback */ - kmutex_t *hash_lock; + arc_buf_hdr_t *hdr = zio->io_private; + arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ + kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; - int freeable = FALSE; - - buf = zio->io_private; - hdr = buf->b_hdr; + int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists @@ -2503,30 +4699,45 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. 
*/ - found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, - &hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) { + ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_dva.dva_word[0], ==, + BP_IDENTITY(zio->io_bp)->dva_word[0]); + ASSERT3U(hdr->b_dva.dva_word[1], ==, + BP_IDENTITY(zio->io_bp)->dva_word[1]); + + arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, + &hash_lock); + + ASSERT((found == hdr && + DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); + } - ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || - (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || - (found == hdr && HDR_L2_READING(hdr))); - - hdr->b_flags &= ~ARC_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) - hdr->b_flags &= ~ARC_L2CACHE; - - /* byteswap if necessary */ - callback_list = hdr->b_acb; - ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { - arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? - byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; - func(buf->b_data, hdr->b_size); + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } } - arc_cksum_compute(buf, B_FALSE); + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); + if (l2arc_noprefetch && HDR_PREFETCH(hdr)) + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); - if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + callback_list = hdr->b_l1hdr.b_acb; + ASSERT3P(callback_list, !=, NULL); + + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already @@ -2537,33 +4748,55 @@ arc_read_done(zio_t *zio) } /* create copies of the data buffer for the callers */ - abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { - if (acb->acb_done) { - if (abuf == NULL) - abuf = arc_buf_clone(buf); - acb->acb_buf = abuf; - abuf = NULL; - } - } - hdr->b_acb = NULL; - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) { - ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_datacnt == 1); - hdr->b_flags |= ARC_BUF_AVAILABLE; - } - - ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); - - if (zio->io_error != 0) { - hdr->b_flags |= ARC_IO_ERROR; - if (hdr->b_state != arc_anon) + if (acb->acb_done != NULL) { + /* + * If we're here, then this must be a demand read + * since prefetch requests don't have callbacks. + * If a read request has a callback (i.e. acb_done is + * not NULL), then we decompress the data for the + * first request and clone the rest. This avoids + * having to waste cpu resources decompressing data + * that nobody is explicitly waiting to read. 
+ */ + if (abuf == NULL) { + acb->acb_buf = arc_buf_alloc_impl(hdr, + acb->acb_private); + if (zio->io_error == 0) { + zio->io_error = + arc_decompress(acb->acb_buf); + } + abuf = acb->acb_buf; + } else { + add_reference(hdr, acb->acb_private); + acb->acb_buf = arc_buf_clone(abuf); + } + } + } + hdr->b_l1hdr.b_acb = NULL; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (abuf == NULL) { + /* + * This buffer didn't have a callback so it must + * be a prefetch. + */ + ASSERT(HDR_PREFETCH(hdr)); + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + } + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || + callback_list != NULL); + + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - freeable = refcount_is_zero(&hdr->b_refcnt); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* @@ -2571,9 +4804,9 @@ arc_read_done(zio_t *zio) * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ - cv_broadcast(&hdr->b_cv); + cv_broadcast(&hdr->b_l1hdr.b_cv); - if (hash_lock) { + if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* @@ -2582,8 +4815,8 @@ arc_read_done(zio_t *zio) * moved to the anonymous state (so that it won't show up * in the cache). */ - ASSERT3P(hdr->b_state, ==, arc_anon); - freeable = refcount_is_zero(&hdr->b_refcnt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ @@ -2605,7 +4838,7 @@ arc_read_done(zio_t *zio) } /* - * "Read" the block block at the specified DVA (in bp) via the + * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was @@ -2621,58 +4854,75 @@ arc_read_done(zio_t *zio) * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. - * - * Normal callers should use arc_read and pass the arc buffer and offset - * for the bp. But if you know you don't need locking, you can use - * arc_read_bp. 
*/ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) -{ - int err; - - ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); - ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); - rw_enter(&pbuf->b_lock, RW_READER); - - err = arc_read_nolock(pio, spa, bp, done, private, priority, - zio_flags, arc_flags, zb); - rw_exit(&pbuf->b_lock); - - return (err); -} - -int -arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_done_func_t *done, void *private, int priority, int zio_flags, - uint32_t *arc_flags, const zbookmark_t *zb) +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, + void *private, zio_priority_t priority, int zio_flags, + arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { - arc_buf_hdr_t *hdr; - arc_buf_t *buf; - kmutex_t *hash_lock; + arc_buf_hdr_t *hdr = NULL; + kmutex_t *hash_lock = NULL; zio_t *rzio; - uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa); + + ASSERT(!BP_IS_EMBEDDED(bp) || + BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); top: - hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), - &hash_lock); - if (hdr && hdr->b_datacnt > 0) { + if (!BP_IS_EMBEDDED(bp)) { + /* + * Embedded BP's have no DVA and require no I/O to "read". + * Create an anonymous arc buf to back it. + */ + hdr = buf_hash_find(guid, bp, &hash_lock); + } - *arc_flags |= ARC_CACHED; + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { + arc_buf_t *buf = NULL; + *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - if (*arc_flags & ARC_WAIT) { - cv_wait(&hdr->b_cv, hash_lock); + if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && + priority == ZIO_PRIORITY_SYNC_READ) { + /* + * This sync read must wait for an + * in-progress async read (e.g. a predictive + * prefetch). Async reads are queued + * separately at the vdev_queue layer, so + * this is a form of priority inversion. + * Ideally, we would "inherit" the demand + * i/o's priority by moving the i/o from + * the async queue to the synchronous queue, + * but there is currently no mechanism to do + * so. Track this so that we can evaluate + * the magnitude of this potential performance + * problem. + * + * Note that if the prefetch i/o is already + * active (has been issued to the device), + * the prefetch improved performance, because + * we issued it sooner than we would have + * without the prefetch. 
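As a usage sketch only: a synchronous caller of the consolidated arc_read() above would look roughly like the following, assuming spa, bp and zb are already in scope as in any DMU-style caller; the variable names and error handling are illustrative and not part of this patch.

	arc_flags_t aflags = ARC_FLAG_WAIT;
	arc_buf_t *abuf = NULL;

	/* Blocks until the read completes; arc_getbuf_func stores the buffer in abuf. */
	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0) {
		/* ... consume abuf->b_data ... */
		arc_buf_destroy(abuf, &abuf);	/* drop the hold taken on our behalf */
	}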
+ */ + DTRACE_PROBE1(arc__sync__wait__for__async, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_sync_wait_for_async); + } + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); + } + + if (*arc_flags & ARC_FLAG_WAIT) { + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { - arc_callback_t *acb = NULL; + arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -2682,10 +4932,9 @@ top: acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); - ASSERT(acb->acb_done != NULL); - acb->acb_next = hdr->b_acb; - hdr->b_acb = acb; - add_reference(hdr, hash_lock, private); + ASSERT3P(acb->acb_done, !=, NULL); + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); return (0); } @@ -2693,126 +4942,151 @@ top: return (0); } - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); if (done) { - add_reference(hdr, hash_lock, private); + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + /* + * This is a demand read which does not have to + * wait for i/o because we did a predictive + * prefetch i/o for it, which has completed. + */ + DTRACE_PROBE1( + arc__demand__hit__predictive__prefetch, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP( + arcstat_demand_hit_predictive_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); + } + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. */ - buf = hdr->b_buf; - ASSERT(buf); - ASSERT(buf->b_data); - if (HDR_BUF_AVAILABLE(hdr)) { - ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + buf = hdr->b_l1hdr.b_buf; + if (buf == NULL) { + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + buf = arc_buf_alloc_impl(hdr, private); + VERIFY0(arc_decompress(buf)); } else { + add_reference(hdr, private); buf = arc_buf_clone(buf); } + ASSERT3P(buf->b_data, !=, NULL); - } else if (*arc_flags & ARC_PREFETCH && - refcount_count(&hdr->b_refcnt) == 0) { - hdr->b_flags |= ARC_PREFETCH; + } else if (*arc_flags & ARC_FLAG_PREFETCH && + refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, buf, private); } else { - uint64_t size = BP_GET_LSIZE(bp); - arc_callback_t *acb; + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); + arc_callback_t *acb; vdev_t *vd = NULL; - uint64_t addr; + uint64_t addr = 0; boolean_t devw = B_FALSE; + uint64_t size; if (hdr == NULL) { /* this block is not in the cache */ - arc_buf_hdr_t *exists; + arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); - hdr = buf->b_hdr; - hdr->b_dva = 
*BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; - exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_GET_COMPRESS(bp), type); + + if (!BP_IS_EMBEDDED(bp)) { + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + exists = buf_hash_insert(hdr, &hash_lock); + } + if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; - (void) arc_buf_remove_ref(buf, private); + buf_discard_identity(hdr); + arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } - /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) { - (void) remove_reference(hdr, hash_lock, - private); - hdr->b_flags |= ARC_PREFETCH; - } - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_INDIRECT; } else { - /* this block is in the ghost cache */ - ASSERT(GHOST_STATE(hdr->b_state)); + /* + * This block is in the ghost cache. If it was L2-only + * (and thus didn't have an L1 hdr), we realloc the + * header to add an L1 hdr. + */ + if (!HDR_HAS_L1HDR(hdr)) { + hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, + hdr_full_cache); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); - ASSERT(hdr->b_buf == NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) - hdr->b_flags |= ARC_PREFETCH; - else - add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); - buf->b_hdr = hdr; - buf->b_data = NULL; - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_next = NULL; - hdr->b_buf = buf; - arc_get_data_buf(buf); - ASSERT(hdr->b_datacnt == 0); - hdr->b_datacnt = 1; + /* + * This is a delicate dance that we play here. + * This hdr is in the ghost list so we access it + * to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). + */ + arc_access(hdr, hash_lock); + arc_hdr_alloc_pdata(hdr); } - - acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); - acb->acb_done = done; - acb->acb_private = private; - - ASSERT(hdr->b_acb == NULL); - hdr->b_acb = acb; - hdr->b_flags |= ARC_IO_IN_PROGRESS; + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + size = arc_hdr_size(hdr); /* - * If the buffer has been evicted, migrate it to a present state - * before issuing the I/O. Once we drop the hash-table lock, - * the header will be marked as I/O in progress and have an - * attached buffer. At this point, anybody who finds this - * buffer ought to notice that it's legit but has a pending I/O. + * If compression is enabled on the hdr, then will do + * RAW I/O and will store the compressed data in the hdr's + * data block. Otherwise, the hdr's data block will contain + * the uncompressed data. 
*/ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW; + } - if (GHOST_STATE(hdr->b_state)) - arc_access(hdr, hash_lock); + if (*arc_flags & ARC_FLAG_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_GET_LEVEL(bp) > 0) + arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); + if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); + ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); + + acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; - if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && - (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr->b_dev->l2ad_writing; - addr = hdr->b_l2hdr->b_daddr; + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + hdr->b_l1hdr.b_acb = acb; + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + if (HDR_HAS_L2HDR(hdr) && + (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr.b_dev->l2ad_writing; + addr = hdr->b_l2hdr.b_daddr; /* * Lock out device removal. */ @@ -2821,15 +5095,39 @@ top: vd = NULL; } - mutex_exit(hash_lock); + if (priority == ZIO_PRIORITY_ASYNC_READ) + arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + else + arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + /* + * At this point, we have a level 1 cache miss. Try again in + * L2ARC if possible. + */ + ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); - ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, - uint64_t, size, zbookmark_t *, zb); + uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); +#ifdef __FreeBSD__ +#ifdef _KERNEL +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_READBPS, size); + racct_add_force(curproc, RACCT_READIOPS, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + curthread->td_ru.ru_inblock++; +#endif +#endif if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* @@ -2841,42 +5139,59 @@ top: * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ - if (hdr->b_l2hdr != NULL && + if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; + void* b_data; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_buf = buf; - cb->l2rcb_spa = spa; + cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; + uint64_t asize = vdev_psize_to_asize(vd, size); + if (asize != size) { + b_data = zio_data_buf_alloc(asize); + cb->l2rcb_data = b_data; + } else { + b_data = hdr->b_l1hdr.b_pdata; + } + + ASSERT(addr >= VDEV_LABEL_START_SIZE && + addr + asize < vd->vdev_psize - + VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). + * Issue a null zio if the underlying buffer + * was squashed to zero size by compression. 
*/ - rzio = zio_read_phys(pio, vd, addr, size, - buf->b_data, ZIO_CHECKSUM_OFF, - l2arc_read_done, cb, priority, zio_flags | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + asize, b_data, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); - if (*arc_flags & ARC_NOWAIT) { + if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); return (0); } - ASSERT(*arc_flags & ARC_WAIT); + ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) return (0); @@ -2899,230 +5214,272 @@ top: } } - rzio = zio_read(pio, spa, bp, buf->b_data, size, - arc_read_done, buf, priority, zio_flags, zb); + rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, + arc_read_done, hdr, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } return (0); } -void -arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) -{ - ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); - ASSERT(buf->b_efunc == NULL); - ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); - - buf->b_efunc = func; - buf->b_private = private; -} - /* - * This is used by the DMU to let the ARC know that a buffer is - * being evicted, so the ARC should clean up. If this arc buf - * is not yet in the evicted state, it will be put there. + * Notify the arc that a block was freed, and thus will never be used again. */ -int -arc_buf_evict(arc_buf_t *buf) +void +arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - arc_buf_t **bufp; + uint64_t guid = spa_load_guid(spa); - rw_enter(&buf->b_lock, RW_WRITER); - hdr = buf->b_hdr; - if (hdr == NULL) { - /* - * We are in arc_do_user_evicts(). - */ - ASSERT(buf->b_data == NULL); - rw_exit(&buf->b_lock); - return (0); - } else if (buf->b_data == NULL) { - arc_buf_t copy = *buf; /* structure assignment */ - /* - * We are on the eviction list; process this buffer now - * but let arc_do_user_evicts() do the reaping. - */ - buf->b_efunc = NULL; - rw_exit(&buf->b_lock); - VERIFY(copy.b_efunc(&copy) == 0); - return (1); - } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); + ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(buf->b_hdr == hdr); - ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + hdr = buf_hash_find(guid, bp, &hash_lock); + if (hdr == NULL) + return; /* - * Pull this buffer off of the hdr + * We might be trying to free a block that is still doing I/O + * (i.e. prefetch) or has a reference (i.e. a dedup-ed, + * dmu_sync-ed block). If this block is being prefetched, then it + * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr + * until the I/O completes. A block may also have a reference if it is + * part of a dedup-ed, dmu_synced write. The dmu_sync() function would + * have written the new block to its final resting place on disk but + * without the dedup flag set. This would have left the hdr in the MRU + * state and discoverable. When the txg finally syncs it detects that + * the block was overridden in open context and issues an override I/O.
+ * Since this is a dedup block, the override I/O will determine if the + * block is already in the DDT. If so, then it will replace the io_bp + * with the bp from the DDT and allow the I/O to finish. When the I/O + * reaches the done callback, dbuf_write_override_done, it will + * check to see if the io_bp and io_bp_override are identical. + * If they are not, then it indicates that the bp was replaced with + * the bp in the DDT and the override bp is freed. This allows + * us to arrive here with a reference on a block that is being + * freed. So if we have an I/O in progress, or a reference to + * this hdr, then we don't destroy the hdr. */ - bufp = &hdr->b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = buf->b_next; - - ASSERT(buf->b_data != NULL); - arc_buf_destroy(buf, FALSE, FALSE); - - if (hdr->b_datacnt == 0) { - arc_state_t *old_state = hdr->b_state; - arc_state_t *evicted_state; - - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - - evicted_state = - (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - - mutex_enter(&old_state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); - - arc_change_state(evicted_state, hdr, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags |= ARC_IN_HASH_TABLE; - hdr->b_flags &= ~ARC_BUF_AVAILABLE; - - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&old_state->arcs_mtx); + if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && + refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); + mutex_exit(hash_lock); + } else { + mutex_exit(hash_lock); } - mutex_exit(hash_lock); - rw_exit(&buf->b_lock); - VERIFY(buf->b_efunc(buf) == 0); - buf->b_efunc = NULL; - buf->b_private = NULL; - buf->b_hdr = NULL; - kmem_cache_free(buf_cache, buf); - return (1); } /* - * Release this buffer from the cache. This must be done - * after a read and prior to modifying the buffer contents. + * Release this buffer from the cache, making it an anonymous buffer. This + * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock; - l2arc_buf_hdr_t *l2hdr; - uint64_t buf_size; - boolean_t released = B_FALSE; + arc_buf_hdr_t *hdr = buf->b_hdr; - rw_enter(&buf->b_lock, RW_WRITER); - hdr = buf->b_hdr; + /* + * It would be nice to assert that if it's DMU metadata (level > + * 0 || it's the dnode file), then it must be syncing context. + * But we don't know that information at this level. + */ - /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_refcnt) > 0); + mutex_enter(&buf->b_evict_lock); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + /* + * We don't grab the hash lock prior to this check, because if + * the buffer's header is in the arc_anon state, it won't be + * linked into the hash table. 
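For orientation, the reworked arc_release()/arc_released() pair above is normally driven from a dbuf-style caller before the buffer contents are modified; a hedged sketch follows, where db and db->db_buf stand in for the caller's own bookkeeping and are not taken from this patch.

	/* Detach the cached buffer so it becomes anonymous before we dirty it. */
	if (!arc_released(db->db_buf))
		arc_release(db->db_buf, db);
	ASSERT(arc_released(db->db_buf));
	/* db->db_buf->b_data may now be overwritten in place. */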
+ */ + if (hdr->b_l1hdr.b_state == arc_anon) { + mutex_exit(&buf->b_evict_lock); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + ASSERT(!HDR_HAS_L2HDR(hdr)); + ASSERT(HDR_EMPTY(hdr)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + + hdr->b_l1hdr.b_arc_access = 0; - if (hdr->b_state == arc_anon) { - /* this buffer is already released */ - ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); - ASSERT(BUF_EMPTY(hdr)); - ASSERT(buf->b_efunc == NULL); + /* + * If the buf is being overridden then it may already + * have a hdr that is not empty. + */ + buf_discard_identity(hdr); arc_buf_thaw(buf); - rw_exit(&buf->b_lock); - released = B_TRUE; - } else { - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - } - l2hdr = hdr->b_l2hdr; - if (l2hdr) { - mutex_enter(&l2arc_buflist_mtx); - hdr->b_l2hdr = NULL; - buf_size = hdr->b_size; + return; } - if (released) - goto out; + kmutex_t *hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * This assignment is only valid as long as the hash_lock is + * held, we must be careful not to reference state or the + * b_state field after dropping the lock. + */ + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(state, !=, arc_anon); + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); + + if (HDR_HAS_L2HDR(hdr)) { + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + + /* + * We have to recheck this conditional again now that + * we're holding the l2ad_mtx to prevent a race with + * another thread which might be concurrently calling + * l2arc_evict(). In that case, l2arc_evict() might have + * destroyed the header's L2 portion as we were waiting + * to acquire the l2ad_mtx. + */ + if (HDR_HAS_L2HDR(hdr)) { + l2arc_trim(hdr); + arc_hdr_l2hdr_destroy(hdr); + } + + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + } /* * Do we have more than one buf? */ - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; - uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; - arc_buf_contents_t type = hdr->b_type; - uint32_t flags = hdr->b_flags; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + enum zio_compress compress = HDR_GET_COMPRESS(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); + VERIFY3U(hdr->b_type, ==, type); + + ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } - ASSERT(hdr->b_buf != buf || buf->b_next != NULL); /* - * Pull the data off of this buf and attach it to - * a new anonymous buf. + * Pull the data off of this hdr and attach it to + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. */ - (void) remove_reference(hdr, hash_lock, tag); - bufp = &hdr->b_buf; - while (*bufp != buf) - bufp = &(*bufp)->b_next; - *bufp = (*bufp)->b_next; + arc_buf_t *lastbuf = NULL; + bufp = &hdr->b_l1hdr.b_buf; + while (*bufp != NULL) { + if (*bufp == buf) { + *bufp = buf->b_next; + } + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. 
+ */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + ASSERT3P(lastbuf, !=, NULL); + + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block, transfer + * ownership and setup sharing with a new arc_buf_t at the end + * of the hdr's b_buf list. + */ + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(lastbuf)); + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. Then, setup a new + * block sharing relationship with the last buffer + * on the arc_buf_t list. + */ + arc_unshare_buf(hdr, buf); + arc_share_buf(hdr, lastbuf); + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + ASSERT(arc_buf_is_shared(lastbuf)); + } + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3P(state, !=, arc_l2c_only); - ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); - if (refcount_is_zero(&hdr->b_refcnt)) { - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; - ASSERT3U(*size, >=, hdr->b_size); - atomic_add_64(size, -hdr->b_size); + (void) refcount_remove_many(&state->arcs_size, + HDR_GET_LSIZE(hdr), buf); + + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + ASSERT3P(state, !=, arc_l2c_only); + (void) refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), buf); } - hdr->b_datacnt -= 1; + + hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); +#ifdef illumos + arc_buf_unwatch(buf); +#endif mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); - nhdr->b_size = blksz; - nhdr->b_spa = spa; - nhdr->b_type = type; - nhdr->b_buf = buf; - nhdr->b_state = arc_anon; - nhdr->b_arc_access = 0; - nhdr->b_flags = flags & ARC_L2_WRITING; - nhdr->b_l2hdr = NULL; - nhdr->b_datacnt = 1; - nhdr->b_freeze_cksum = NULL; - (void) refcount_add(&nhdr->b_refcnt, tag); + /* + * Allocate a new hdr. The new hdr will contain a b_pdata + * buffer which will be freed in arc_write(). 
+ */ + nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); + ASSERT0(nhdr->b_l1hdr.b_bufcnt); + ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); + VERIFY3U(nhdr->b_type, ==, type); + ASSERT(!HDR_SHARED_DATA(nhdr)); + + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_bufcnt = 1; + (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; - rw_exit(&buf->b_lock); - atomic_add_64(&arc_anon->arcs_size, blksz); + + mutex_exit(&buf->b_evict_lock); + (void) refcount_add_many(&arc_anon->arcs_size, + HDR_GET_LSIZE(nhdr), buf); } else { - rw_exit(&buf->b_lock); - ASSERT(refcount_count(&hdr->b_refcnt) == 1); - ASSERT(!list_link_active(&hdr->b_arc_node)); + mutex_exit(&buf->b_evict_lock); + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); + /* protected by hash lock, or hdr is on arc_anon */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_arc_access = 0; + hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); - bzero(&hdr->b_dva, sizeof (dva_t)); - hdr->b_birth = 0; - hdr->b_cksum0 = 0; + buf_discard_identity(hdr); arc_buf_thaw(buf); } - buf->b_efunc = NULL; - buf->b_private = NULL; - -out: - if (l2hdr) { - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); - ARCSTAT_INCR(arcstat_l2_size, -buf_size); - mutex_exit(&l2arc_buflist_mtx); - } } int @@ -3130,62 +5487,132 @@ arc_released(arc_buf_t *buf) { int released; - rw_enter(&buf->b_lock, RW_READER); - released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); - rw_exit(&buf->b_lock); + mutex_enter(&buf->b_evict_lock); + released = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_state == arc_anon); + mutex_exit(&buf->b_evict_lock); return (released); } -int -arc_has_callback(arc_buf_t *buf) -{ - int callback; - - rw_enter(&buf->b_lock, RW_READER); - callback = (buf->b_efunc != NULL); - rw_exit(&buf->b_lock); - return (callback); -} - #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; - rw_enter(&buf->b_lock, RW_READER); - referenced = (refcount_count(&buf->b_hdr->b_refcnt)); - rw_exit(&buf->b_lock); + mutex_enter(&buf->b_evict_lock); + referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); + mutex_exit(&buf->b_evict_lock); return (referenced); } -#endif +#endif + +static void +arc_write_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + + /* + * If we're reexecuting this zio because the pool suspended, then + * cleanup any state that was previously set the first time the + * callback as invoked. 
+ */ + if (zio->io_flags & ZIO_FLAG_REEXECUTED) { + arc_cksum_free(hdr); +#ifdef illumos + arc_buf_unwatch(buf); +#endif + if (hdr->b_l1hdr.b_pdata != NULL) { + if (arc_buf_is_shared(buf)) { + ASSERT(HDR_SHARED_DATA(hdr)); + + arc_unshare_buf(hdr, buf); + } else { + arc_hdr_free_pdata(hdr); + } + } + } + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + + callback->awcb_ready(zio, buf, callback->awcb_private); + + if (HDR_IO_IN_PROGRESS(hdr)) + ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); + + arc_cksum_compute(buf); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + + enum zio_compress compress; + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); + compress = BP_GET_COMPRESS(zio->io_bp); + } + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + + /* + * If the hdr is compressed, then copy the compressed + * zio contents into arc_buf_hdr_t. Otherwise, copy the original + * data buf into the hdr. Ideally, we would like to always copy the + * io_data into b_pdata but the user may have disabled compressed + * arc thus the on-disk block may or may not match what we maintain + * in the hdr's b_pdata field. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pdata(hdr); + bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); + } else { + ASSERT3P(buf->b_data, ==, zio->io_orig_data); + ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); + ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + /* + * This hdr is not compressed so we're able to share + * the arc_buf_t data buffer with the hdr. + */ + arc_share_buf(hdr, buf); + VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, + HDR_GET_LSIZE(hdr))); + } + arc_hdr_verify(hdr, zio->io_bp); +} static void -arc_write_ready(zio_t *zio) +arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; - arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); - callback->awcb_ready(zio, buf, callback->awcb_private); + callback->awcb_children_ready(zio, buf, callback->awcb_private); +} - /* - * If the IO is already in progress, then this is a re-write - * attempt, so we need to thaw and re-compute the cksum. - * It is the responsibility of the callback to handle the - * accounting for any re-write attempt. - */ - if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_freeze_lock); - if (hdr->b_freeze_cksum != NULL) { - kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); - hdr->b_freeze_cksum = NULL; - } - mutex_exit(&hdr->b_freeze_lock); - } - arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_IO_IN_PROGRESS; +/* + * The SPA calls this callback for each physical write that happens on behalf + * of a logical write. See the comment in dbuf_write_physdone() for details. 
+ */ +static void +arc_write_physdone(zio_t *zio) +{ + arc_write_callback_t *cb = zio->io_private; + if (cb->awcb_physdone != NULL) + cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void @@ -3195,23 +5622,28 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_acb == NULL); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + arc_hdr_verify(hdr, zio->io_bp); + + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + buf_discard_identity(hdr); + } else { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + } } else { - ASSERT(BUF_EMPTY(hdr)); + ASSERT(HDR_EMPTY(hdr)); } /* - * If the block to be written was all-zero, we may have - * compressed it away. In this case no write was performed - * so there will be no dva/birth-date/checksum. The buffer - * must therefor remain anonymous (and uncached). + * If the block to be written was all-zero or compressed enough to be + * embedded in the BP, no write was performed so there will be no + * dva/birth/checksum. The buffer must therefore remain anonymous + * (and uncached). */ - if (!BUF_EMPTY(hdr)) { + if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; @@ -3220,7 +5652,7 @@ arc_write_done(zio_t *zio) arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove @@ -3230,116 +5662,115 @@ arc_write_done(zio_t *zio) if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); - ASSERT(refcount_is_zero(&exists->b_refcnt)); + ASSERT(refcount_is_zero( + &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); + } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + /* nopwrite */ + ASSERT(zio->io_prop.zp_nopwrite); + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad nopwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_datacnt == 1); - ASSERT(hdr->b_state == arc_anon); + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ - if (!exists && hdr->b_state == arc_anon) + if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * -arc_write(zio_t *pio, spa_t *spa, uint64_t txg, - blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, - arc_done_func_t *ready, arc_done_func_t *done, void *private, - int priority, int zio_flags, const zbookmark_t *zb) +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, + boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, 
+ arc_done_func_t *children_ready, arc_done_func_t *physdone, + arc_done_func_t *done, void *private, zio_priority_t priority, + int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; - ASSERT(ready != NULL); - ASSERT(done != NULL); + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) - hdr->b_flags |= ARC_L2CACHE; + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; + callback->awcb_children_ready = children_ready; + callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, - arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); - - return (zio); -} - -void -arc_free(spa_t *spa, const blkptr_t *bp) -{ - arc_buf_hdr_t *ab; - kmutex_t *hash_lock; - uint64_t guid = spa_guid(spa); - /* - * If this buffer is in the cache, release it, so it can be re-used. + * The hdr's b_pdata is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). */ - ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), - &hash_lock); - if (ab != NULL) { - if (ab->b_state != arc_anon) - arc_change_state(arc_anon, ab, hash_lock); - if (HDR_IO_IN_PROGRESS(ab)) { - /* - * This should only happen when we prefetch. - */ - ASSERT(ab->b_flags & ARC_PREFETCH); - ASSERT3U(ab->b_datacnt, ==, 1); - ab->b_flags |= ARC_FREED_IN_READ; - if (HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); + if (hdr->b_l1hdr.b_pdata != NULL) { + /* + * If the buf is currently sharing the data block with + * the hdr then we need to break that relationship here. + * The hdr will remain with a NULL data pointer and the + * buf will take sole ownership of the block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_LAST(buf)); + arc_unshare_buf(hdr, buf); } else { - ASSERT(refcount_is_zero(&ab->b_refcnt)); - ab->b_flags |= ARC_FREE_IN_PROGRESS; - mutex_exit(hash_lock); - arc_hdr_destroy(ab); - ARCSTAT_BUMP(arcstat_deleted); + arc_hdr_free_pdata(hdr); } + VERIFY3P(buf->b_data, !=, NULL); + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } + ASSERT(!arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); + + zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, + arc_write_ready, + (children_ready != NULL) ? 
arc_write_children_ready : NULL, + arc_write_physdone, arc_write_done, callback, + priority, zio_flags, zb); + + return (zio); } static int -arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) +arc_memory_throttle(uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; +#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) available_memory = - MIN(available_memory, vmem_size(kmem_arena, VMEM_FREE)); - if (available_memory >= zfs_write_limit_max) + MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); +#endif + + if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { @@ -3351,32 +5782,18 @@ arc_memory_throttle(uint64_t reserve, ui * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ - if (curproc == proc_pageout) { + if (curlwp == uvm.pagedaemon_lwp) { if (page_load > MAX(ptob(minfree), available_memory) / 4) - return (ERESTART); + return (SET_ERROR(ERESTART)); /* Note: reserve is inflated, so we deflate */ page_load += reserve / 8; return (0); } else if (page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - return (EAGAIN); + return (SET_ERROR(EAGAIN)); } page_load = 0; - - if (arc_size > arc_c_min) { - uint64_t evictable_memory = - arc_mru->arcs_lsize[ARC_BUFC_DATA] + - arc_mru->arcs_lsize[ARC_BUFC_METADATA] + - arc_mfu->arcs_lsize[ARC_BUFC_DATA] + - arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; - available_memory += MIN(evictable_memory, arc_size - arc_c_min); - } - - if (inflight_data > available_memory / 4) { - ARCSTAT_INCR(arcstat_memory_throttle_count, 1); - return (ERESTART); - } #endif return (0); } @@ -3394,33 +5811,28 @@ arc_tempreserve_space(uint64_t reserve, int error; uint64_t anon_size; -#ifdef ZFS_DEBUG - /* - * Once in a while, fail for no reason. Everything should cope. - */ - if (spa_get_random(10000) == 0) { - dprintf("forcing random failure\n"); - return (ERESTART); - } -#endif - if (reserve > arc_c/4 && !arc_no_grow) + if (reserve > arc_c/4 && !arc_no_grow) { arc_c = MIN(arc_c_max, reserve * 4); + DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); + } if (reserve > arc_c) - return (ENOMEM); + return (SET_ERROR(ENOMEM)); /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ - anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - + arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations - * in order to compress/encrypt/etc the data. We therefor need to + * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. 
*/ - if (error = arc_memory_throttle(reserve, anon_size, txg)) + error = arc_memory_throttle(reserve, txg); + if (error != 0) return (error); /* @@ -3433,86 +5845,301 @@ arc_tempreserve_space(uint64_t reserve, if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { + uint64_t meta_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", - arc_tempreserve>>10, - arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, - arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, - reserve>>10, arc_c>>10); - return (ERESTART); + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); + return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } -#if defined(__NetBSD__) && defined(_KERNEL) -/* Reclaim hook registered to uvm for reclaiming KVM and memory */ static void -arc_uvm_reclaim_hook(void) +arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *evict_data, kstat_named_t *evict_metadata) { - - if (mutex_tryenter(&arc_reclaim_thr_lock)) { - cv_broadcast(&arc_reclaim_thr_cv); - mutex_exit(&arc_reclaim_thr_lock); - } + size->value.ui64 = refcount_count(&state->arcs_size); + evict_data->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int -arc_kva_reclaim_callback(struct callback_entry *ce, void *obj, void *arg) +arc_kstat_update(kstat_t *ksp, int rw) { + arc_stats_t *as = ksp->ks_data; - - if (mutex_tryenter(&arc_reclaim_thr_lock)) { - cv_broadcast(&arc_reclaim_thr_cv); - mutex_exit(&arc_reclaim_thr_lock); + if (rw == KSTAT_WRITE) { + return (EACCES); + } else { + arc_kstat_update_state(arc_anon, + &as->arcstat_anon_size, + &as->arcstat_anon_evictable_data, + &as->arcstat_anon_evictable_metadata); + arc_kstat_update_state(arc_mru, + &as->arcstat_mru_size, + &as->arcstat_mru_evictable_data, + &as->arcstat_mru_evictable_metadata); + arc_kstat_update_state(arc_mru_ghost, + &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_evictable_data, + &as->arcstat_mru_ghost_evictable_metadata); + arc_kstat_update_state(arc_mfu, + &as->arcstat_mfu_size, + &as->arcstat_mfu_evictable_data, + &as->arcstat_mfu_evictable_metadata); + arc_kstat_update_state(arc_mfu_ghost, + &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_evictable_data, + &as->arcstat_mfu_ghost_evictable_metadata); } - - return CALLBACK_CHAIN_CONTINUE; + + return (0); +} + +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the ARC eviction + * code is laid out; arc_evict_state() assumes ARC buffers are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +unsigned int +arc_state_multilist_index_func(multilist_t *ml, void *obj) +{ + arc_buf_hdr_t *hdr = obj; + + /* + * We rely on b_dva to generate evenly distributed index + * numbers using buf_hash below. So, as an added precaution, + * let's make sure we never add empty buffers to the arc lists. + */ + ASSERT(!HDR_EMPTY(hdr)); + + /* + * The assumption here, is the hash value for a given + * arc_buf_hdr_t will remain constant throughout it's lifetime + * (i.e. it's b_spa, b_dva, and b_birth fields don't change). 
+ * Thus, we don't need to store the header's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. + */ + return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % + multilist_get_num_sublists(ml)); +} + +#ifdef _KERNEL +#ifdef __FreeBSD__ +static eventhandler_tag arc_event_lowmem = NULL; +#endif + +static void +arc_lowmem(void *arg __unused, int howto __unused) +{ + + mutex_enter(&arc_reclaim_lock); + /* XXX: Memory deficit should be passed as argument. */ + needfree = btoc(arc_c >> arc_shrink_shift); + DTRACE_PROBE(arc__needfree); + cv_signal(&arc_reclaim_thread_cv); + + /* + * It is unsafe to block here in arbitrary threads, because we can come + * here from ARC itself and may hold ARC locks and thus risk a deadlock + * with ARC reclaim thread. + */ + if (curlwp == uvm.pagedaemon_lwp) + (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); + mutex_exit(&arc_reclaim_lock); +} +#endif + +static void +arc_state_init(void) +{ + arc_anon = &ARC_anon; + arc_mru = &ARC_mru; + arc_mru_ghost = &ARC_mru_ghost; + arc_mfu = &ARC_mfu; + arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; + + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + 
refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_create(&arc_anon->arcs_size); + refcount_create(&arc_mru->arcs_size); + refcount_create(&arc_mru_ghost->arcs_size); + refcount_create(&arc_mfu->arcs_size); + refcount_create(&arc_mfu_ghost->arcs_size); + refcount_create(&arc_l2c_only->arcs_size); +} + +static void +arc_state_fini(void) +{ + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + refcount_destroy(&arc_anon->arcs_size); + refcount_destroy(&arc_mru->arcs_size); + refcount_destroy(&arc_mru_ghost->arcs_size); + refcount_destroy(&arc_mfu->arcs_size); + refcount_destroy(&arc_mfu_ghost->arcs_size); + refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); } -#endif /* __NetBSD__ */ +uint64_t +arc_max_bytes(void) +{ + return (arc_c_max); +} void arc_init(void) { - mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); + int i, prefetch_tunable_set = 0; + + mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); + cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); + +#ifdef __FreeBSD__ + mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); +#endif /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ - arc_c = physmem * PAGESIZE / 8; + arc_c = kmem_size() / 8; +#ifdef illumos #ifdef _KERNEL /* * On architectures where the physical memory can be larger * than the addressable space (intel in 32-bit mode), we may * need to limit the cache to 1/8 of VM size. 
*/ - arc_c = MIN(arc_c, vmem_size(kmem_arena, VMEM_ALLOC | VMEM_FREE) / 8); + arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif - - /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ - arc_c_min = MAX(arc_c / 4, 64<<20); - /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ - if (arc_c * 8 >= 1<<30) - arc_c_max = (arc_c * 8) - (1<<30); +#endif /* illumos */ + /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ + arc_c_min = MAX(arc_c / 4, arc_abs_min); + /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ + if (arc_c * 8 >= 1 << 30) + arc_c_max = (arc_c * 8) - (1 << 30); else arc_c_max = arc_c_min; - arc_c_max = MAX(arc_c * 6, arc_c_max); + arc_c_max = MAX(arc_c * 5, arc_c_max); + + /* + * In userland, there's only the memory pressure that we artificially + * create (see arc_available_memory()). Don't let arc_c get too + * small, because it can cause transactions to be larger than + * arc_c, causing arc_tempreserve_space() to fail. + */ +#ifndef _KERNEL + arc_c_min = arc_c_max / 2; +#endif +#ifdef _KERNEL /* * Allow the tunables to override our calculations if they are - * reasonable (ie. over 64MB) + * reasonable. */ - if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) + if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) { arc_c_max = zfs_arc_max; - if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) + arc_c_min = MIN(arc_c_min, arc_c_max); + } + if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; +#endif arc_c = arc_c_max; arc_p = (arc_c >> 1); + arc_size = 0; /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; @@ -3524,143 +6151,182 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + if (zfs_arc_meta_min > 0) { + arc_meta_min = zfs_arc_meta_min; + } else { + arc_meta_min = arc_c_min / 2; + } + if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; + /* + * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
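The sizing rules above are easier to follow with concrete numbers. A minimal userland sketch of the same arithmetic, assuming 16 GB of kernel memory and no zfs_arc_min/zfs_arc_max tunables set; the 16 MB floor standing in for arc_abs_min is an assumption:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX(a, b)	((a) > (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t kmem = 16ULL << 30;            /* assume 16 GB of kmem */
        uint64_t abs_min = 16ULL << 20;         /* assumed floor, cf. arc_abs_min */

        uint64_t c = kmem / 8;                  /* start with 1/8 of memory: 2 GB */
        uint64_t c_min = MAX(c / 4, abs_min);   /* 1/32 of memory: 512 MB */
        uint64_t c_max;

        if (c * 8 >= 1ULL << 30)                /* all but 1 GB ... */
            c_max = (c * 8) - (1ULL << 30);     /* 15 GB */
        else
            c_max = c_min;
        c_max = MAX(c * 5, c_max);              /* ... but at least 5/8 of memory */

        printf("arc_c_min=%llu MB arc_c_max=%llu MB\n",
            (unsigned long long)(c_min >> 20), (unsigned long long)(c_max >> 20));
        return (0);
    }

With those inputs the sketch prints arc_c_min=512 MB and arc_c_max=15360 MB, which the tunables may then override as shown in the hunk above.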
+ */ + if (arc_no_grow_shift >= arc_shrink_shift) + arc_no_grow_shift = arc_shrink_shift - 1; + if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; + if (zfs_arc_num_sublists_per_state < 1) + zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1); + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; - arc_anon = &ARC_anon; - arc_mru = &ARC_mru; - arc_mru_ghost = &ARC_mru_ghost; - arc_mfu = &ARC_mfu; - arc_mfu_ghost = &ARC_mfu_ghost; - arc_l2c_only = &ARC_l2c_only; - arc_size = 0; - - mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + zfs_arc_min = arc_c_min; + zfs_arc_max = arc_c_max; + arc_state_init(); buf_init(); - arc_thread_exit = 0; - arc_eviction_list = NULL; - mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); - bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); + arc_reclaim_thread_exit = B_FALSE; +#ifdef __FreeBSD__ + arc_dnlc_evicts_thread_exit = FALSE; +#endif arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; + arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, - TS_RUN, maxclsyspri); - -#if defined(__NetBSD__) && defined(_KERNEL) -/* arc_hook.uvm_reclaim_hook = &arc_uvm_reclaim_hook; + TS_RUN, minclsyspri); - uvm_reclaim_hook_add(&arc_hook); - callback_register(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, - &arc_kva_reclaim_entry, NULL, arc_kva_reclaim_callback); */ +#ifdef __FreeBSD__ +#ifdef _KERNEL + arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, + EVENTHANDLER_PRI_FIRST); +#endif + (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); #endif - arc_dead = FALSE; + arc_dead = B_FALSE; arc_warm = B_FALSE; - if (zfs_write_limit_max == 0) - zfs_write_limit_max = ptob(physmem) >> 
zfs_write_limit_shift; - else - zfs_write_limit_shift = 0; - mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL); + /* + * Calculate maximum amount of dirty data per pool. + * + * If it has been set by /etc/system, take that. + * Otherwise, use a percentage of physical memory defined by + * zfs_dirty_data_max_percent (default 10%) with a cap at + * zfs_dirty_data_max_max (default 4GB). + */ + if (zfs_dirty_data_max == 0) { + zfs_dirty_data_max = ptob(physmem) * + zfs_dirty_data_max_percent / 100; + zfs_dirty_data_max = MIN(zfs_dirty_data_max, + zfs_dirty_data_max_max); + } + +#ifdef _KERNEL + if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) + prefetch_tunable_set = 1; + +#ifdef __i386__ + if (prefetch_tunable_set == 0) { + printf("ZFS NOTICE: Prefetch is disabled by default on i386 " + "-- to enable,\n"); + printf(" add \"vfs.zfs.prefetch_disable=0\" " + "to /boot/loader.conf.\n"); + zfs_prefetch_disable = 1; + } +#else + if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && + prefetch_tunable_set == 0) { + printf("ZFS NOTICE: Prefetch is disabled by default if less " + "than 4GB of RAM is present;\n" + " to enable, add \"vfs.zfs.prefetch_disable=0\" " + "to /boot/loader.conf.\n"); + zfs_prefetch_disable = 1; + } +#endif + /* Warn about ZFS memory and address space requirements. */ + if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { + printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " + "expect unstable behavior.\n"); + } + if (kmem_size() < 512 * (1 << 20)) { + printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " + "expect unstable behavior.\n"); + printf(" Consider tuning vm.kmem_size and " + "vm.kmem_size_max\n"); + printf(" in /boot/loader.conf.\n"); + } +#endif } void arc_fini(void) { - mutex_enter(&arc_reclaim_thr_lock); - arc_thread_exit = 1; - while (arc_thread_exit != 0) - cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); - mutex_exit(&arc_reclaim_thr_lock); + mutex_enter(&arc_reclaim_lock); + arc_reclaim_thread_exit = B_TRUE; + /* + * The reclaim thread will set arc_reclaim_thread_exit back to + * B_FALSE when it is finished exiting; we're waiting for that. + */ + while (arc_reclaim_thread_exit) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); + } + mutex_exit(&arc_reclaim_lock); - arc_flush(NULL); + /* Use B_TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, B_TRUE); - arc_dead = TRUE; +#ifdef __FreeBSD__ + mutex_enter(&arc_dnlc_evicts_lock); + arc_dnlc_evicts_thread_exit = TRUE; + + /* + * The user evicts thread will set arc_user_evicts_thread_exit + * to FALSE when it is finished exiting; we're waiting for that. 
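Both shutdown paths in arc_fini() use the same handshake: the caller sets an exit flag, signals the thread's CV, and sleeps on that CV until the thread clears the flag on its way out. A minimal pthreads sketch of the pattern, assuming (as the ARC threads do) that the worker clears the flag and signals back just before it exits:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static bool exit_requested = false;

    static void *
    worker(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!exit_requested)             /* the real thread does work + timed waits */
            pthread_cond_wait(&cv, &lock);
        exit_requested = false;             /* acknowledge: we are on our way out */
        pthread_cond_signal(&cv);
        pthread_mutex_unlock(&lock);
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);

        /* The _fini() side: request exit, then wait for the acknowledgement. */
        pthread_mutex_lock(&lock);
        exit_requested = true;
        while (exit_requested) {
            pthread_cond_signal(&cv);
            pthread_cond_wait(&cv, &lock);
        }
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        printf("worker exited\n");
        return (0);
    }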
+ */ + while (arc_dnlc_evicts_thread_exit) { + cv_signal(&arc_dnlc_evicts_cv); + cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); + } + mutex_exit(&arc_dnlc_evicts_lock); + + mutex_destroy(&arc_dnlc_evicts_lock); + cv_destroy(&arc_dnlc_evicts_cv); +#endif + + arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } - mutex_destroy(&arc_eviction_mtx); - mutex_destroy(&arc_reclaim_thr_lock); - cv_destroy(&arc_reclaim_thr_cv); - - list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - mutex_destroy(&arc_anon->arcs_mtx); - mutex_destroy(&arc_mru->arcs_mtx); - mutex_destroy(&arc_mru_ghost->arcs_mtx); - mutex_destroy(&arc_mfu->arcs_mtx); - mutex_destroy(&arc_mfu_ghost->arcs_mtx); - mutex_destroy(&arc_l2c_only->arcs_mtx); - - mutex_destroy(&zfs_write_limit_lock); - -#if defined(__NetBSD__) && defined(_KERNEL) -/* uvm_reclaim_hook_del(&arc_hook); - callback_unregister(&vm_map_to_kernel(kernel_map)->vmk_reclaim_callback, - &arc_kva_reclaim_entry); */ -#endif - + mutex_destroy(&arc_reclaim_lock); + cv_destroy(&arc_reclaim_thread_cv); + cv_destroy(&arc_reclaim_waiters_cv); + + arc_state_fini(); buf_fini(); - ASSERT(arc_loaned_bytes == 0); + ASSERT0(arc_loaned_bytes); + +#ifdef __FreeBSD__ +#ifdef _KERNEL + if (arc_event_lowmem != NULL) + EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); +#endif +#endif } /* @@ -3718,8 +6384,12 @@ arc_fini(void) * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are - * not already there. It scans until a headroom of buffers is satisfied, - * which itself is a buffer for ARC eviction. The thread that does this is + * not already there. It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. If a compressible buffer is + * found during scanning and selected for writing to an L2ARC device, we + * temporarily boost scanning headroom during the next scan cycle to make + * sure we adapt to compression effects (which might significantly reduce + * the data volume we write to L2ARC). 
The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * @@ -3784,6 +6454,11 @@ arc_fini(void) * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache + * l2arc_headroom_boost when we find compressed buffers during ARC + * scanning, we multiply headroom by this + * percentage factor for the next scan cycle, + * since more compressed buffers are likely to + * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are @@ -3800,7 +6475,7 @@ arc_fini(void) */ static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: @@ -3809,22 +6484,45 @@ l2arc_write_eligible(uint64_t spa_guid, * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || - HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + if (hdr->b_spa != spa_guid) { + ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); + return (B_FALSE); + } + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_BUMP(arcstat_l2_write_in_l2); + return (B_FALSE); + } + if (HDR_IO_IN_PROGRESS(hdr)) { + ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); + return (B_FALSE); + } + if (!HDR_L2CACHE(hdr)) { + ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); return (B_FALSE); + } return (B_TRUE); } static uint64_t -l2arc_write_size(l2arc_dev_t *dev) +l2arc_write_size(void) { uint64_t size; - size = dev->l2ad_write; + /* + * Make sure our globals have meaningful values in case the user + * altered them. + */ + size = l2arc_write_max; + if (size == 0) { + cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " + "be greater than zero, resetting it to the default (%d)", + L2ARC_WRITE_SIZE); + size = l2arc_write_max = L2ARC_WRITE_SIZE; + } if (arc_warm == B_FALSE) - size += dev->l2ad_boost; + size += l2arc_write_boost; return (size); @@ -3852,20 +6550,6 @@ l2arc_write_interval(clock_t began, uint return (next); } -static void -l2arc_hdr_stat_add(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); -} - -static void -l2arc_hdr_stat_remove(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); -} - /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. 
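With the per-device write fields gone, l2arc_write_size() works from the global tunables: a zero l2arc_write_max is reset to the default, and l2arc_write_boost is added only while the ARC is still warming up. A minimal sketch of that policy; the 8 MB default and boost values here are assumptions standing in for L2ARC_WRITE_SIZE:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define L2ARC_WRITE_SIZE	(8ULL << 20)    /* assumed default, 8 MB */

    static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;
    static uint64_t l2arc_write_boost = 8ULL << 20; /* assumed boost, 8 MB */
    static bool arc_warm = false;

    static uint64_t
    write_size(void)
    {
        uint64_t size = l2arc_write_max;

        if (size == 0) {
            /* A bad tunable is repaired rather than honoured. */
            size = l2arc_write_max = L2ARC_WRITE_SIZE;
        }
        if (!arc_warm)                      /* extra writes during warmup */
            size += l2arc_write_boost;
        return (size);
    }

    int
    main(void)
    {
        printf("cold: %llu MB\n", (unsigned long long)(write_size() >> 20));
        arc_warm = true;
        printf("warm: %llu MB\n", (unsigned long long)(write_size() >> 20));
        return (0);
    }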
@@ -3941,9 +6625,13 @@ l2arc_do_free_on_write() for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); - ASSERT(df->l2df_data != NULL); - ASSERT(df->l2df_func != NULL); - df->l2df_func(df->l2df_data, df->l2df_size); + ASSERT3P(df->l2df_data, !=, NULL); + if (df->l2df_type == ARC_BUFC_METADATA) { + zio_buf_free(df->l2df_data, df->l2df_size); + } else { + ASSERT(df->l2df_type == ARC_BUFC_DATA); + zio_data_buf_free(df->l2df_data, df->l2df_size); + } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } @@ -3961,66 +6649,107 @@ l2arc_write_done(zio_t *zio) l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; - arc_buf_hdr_t *head, *ab, *ab_prev; - l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; + int64_t bytes_dropped = 0; cb = zio->io_private; - ASSERT(cb != NULL); + ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; - ASSERT(dev != NULL); + ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; - ASSERT(head != NULL); - buflist = dev->l2ad_buflist; - ASSERT(buflist != NULL); + ASSERT3P(head, !=, NULL); + buflist = &dev->l2ad_buflist; + ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&l2arc_buflist_mtx); - /* * All writes completed, or an error was hit. */ - for (ab = list_prev(buflist, head); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); +top: + mutex_enter(&dev->l2ad_mtx); + for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + + hash_lock = HDR_LOCK(hdr); - hash_lock = HDR_LOCK(ab); + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* - * This buffer misses out. It may be in a stage - * of eviction. Its ARC_L2_WRITING flag will be - * left set, denying reads to this buffer. + * Missed the hash lock. We must retry so we + * don't leave the ARC_FLAG_L2_WRITING bit set. */ - ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); - continue; + ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); + + /* + * We don't want to rescan the headers we've + * already marked as having been written out, so + * we reinsert the head node so we can pick up + * where we left off. + */ + list_remove(buflist, head); + list_insert_after(buflist, hdr, head); + + mutex_exit(&dev->l2ad_mtx); + + /* + * We wait for the hash lock to become available + * to try and prevent busy waiting, and increase + * the chance we'll be able to acquire the lock + * the next time around. + */ + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; } + /* + * We could not have been moved into the arc_l2c_only + * state while in-flight due to our ARC_FLAG_L2_WRITING + * bit being set. Let's just ensure that's being enforced. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. 
*/ - list_remove(buflist, ab); - abl2 = ab->b_l2hdr; - ab->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + list_remove(buflist, hdr); + l2arc_trim(hdr); + arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); + + ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); + + bytes_dropped += arc_hdr_size(hdr); + (void) refcount_remove_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); } /* - * Allow ARC to begin reads to this L2ARC entry. + * Allow ARC to begin reads and ghost list evictions to + * this L2ARC entry. */ - ab->b_flags &= ~ARC_L2_WRITING; + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); - kmem_cache_free(hdr_cache, head); - mutex_exit(&l2arc_buflist_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + mutex_exit(&dev->l2ad_mtx); + + vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); @@ -4036,34 +6765,63 @@ l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; - arc_buf_t *buf; kmutex_t *hash_lock; - int equal; + boolean_t valid_cksum; - ASSERT(zio->io_vd != NULL); + ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; - ASSERT(cb != NULL); - buf = cb->l2rcb_buf; - ASSERT(buf != NULL); - hdr = buf->b_hdr; - ASSERT(hdr != NULL); + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + /* + * If the data was read into a temporary buffer, + * move it and free the buffer. + */ + if (cb->l2rcb_data != NULL) { + ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); + if (zio->io_error == 0) { + bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata, + arc_hdr_size(hdr)); + } + + /* + * The following must be done regardless of whether + * there was an error: + * - free the temporary buffer + * - point zio to the real ARC buffer + * - set zio size accordingly + * These are required because zio is either re-used for + * an I/O of the block in the case of the error + * or the zio is passed to arc_read_done() and it + * needs real data. + */ + zio_data_buf_free(cb->l2rcb_data, zio->io_size); + zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); + zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata; + } + + ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. 
*/ - equal = arc_cksum_equal(buf); - if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + + valid_cksum = arc_cksum_is_equal(hdr, zio); + if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); - zio->io_private = buf; - zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ - zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); @@ -4074,9 +6832,9 @@ l2arc_read_done(zio_t *zio) if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { - zio->io_error = EIO; + zio->io_error = SET_ERROR(EIO); } - if (!equal) + if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -4089,9 +6847,10 @@ l2arc_read_done(zio_t *zio) ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); - zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, - buf->b_data, zio->io_size, arc_read_done, buf, - zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, + hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb)); } } @@ -4108,35 +6867,37 @@ l2arc_read_done(zio_t *zio) * the data lists. This function returns a locked list, and also returns * the lock pointer. */ -static list_t * -l2arc_list_locked(int list_num, kmutex_t **lock) +static multilist_sublist_t * +l2arc_sublist_lock(int list_num) { - list_t *list; + multilist_t *ml = NULL; + unsigned int idx; ASSERT(list_num >= 0 && list_num <= 3); switch (list_num) { case 0: - list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: - list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: - list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: - list = &arc_mru->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; } - ASSERT(!(MUTEX_HELD(*lock))); - mutex_enter(*lock); - return (list); + /* + * Return a randomly-selected sublist. This is acceptable + * because the caller feeds only a little bit of data for each + * call (8MB). Subsequent calls will result in different + * sublists being selected. 
+ */ + idx = multilist_get_random_index(ml); + return (multilist_sublist_lock(ml, idx)); } /* @@ -4149,15 +6910,11 @@ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; - l2arc_buf_hdr_t *abl2; - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; - buflist = dev->l2ad_buflist; - - if (buflist == NULL) - return; + buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* @@ -4180,35 +6937,41 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t d uint64_t, taddr, boolean_t, all); top: - mutex_enter(&l2arc_buflist_mtx); - for (ab = list_tail(buflist); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); + mutex_enter(&dev->l2ad_mtx); + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + + hash_lock = HDR_LOCK(hdr); - hash_lock = HDR_LOCK(ab); + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } - if (HDR_L2_WRITE_HEAD(ab)) { + if (HDR_L2_WRITE_HEAD(hdr)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). */ - list_remove(buflist, ab); + list_remove(buflist, hdr); mutex_exit(hash_lock); continue; } - if (!all && ab->b_l2hdr != NULL && - (ab->b_l2hdr->b_daddr > taddr || - ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + if (!all && HDR_HAS_L2HDR(hdr) && + (hdr->b_l2hdr.b_daddr >= taddr || + hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. @@ -4217,94 +6980,78 @@ top: break; } - if (HDR_FREE_IN_PROGRESS(ab)) { - /* - * Already on the path to destruction. - */ - mutex_exit(hash_lock); - continue; - } - - if (ab->b_state == arc_l2c_only) { - ASSERT(!HDR_L2_READING(ab)); + ASSERT(HDR_HAS_L2HDR(hdr)); + if (!HDR_HAS_L1HDR(hdr)) { + ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ - arc_change_state(arc_anon, ab, hash_lock); - arc_hdr_destroy(ab); + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } else { + ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); + ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ - if (HDR_L2_READING(ab)) { + if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); - ab->b_flags |= ARC_L2_EVICTED; + arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } - /* - * Tell ARC this no longer exists in L2ARC. - */ - if (ab->b_l2hdr != NULL) { - abl2 = ab->b_l2hdr; - ab->b_l2hdr = NULL; - kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); - } - list_remove(buflist, ab); + /* Ensure this header has finished being written */ + ASSERT(!HDR_L2_WRITING(hdr)); - /* - * This may have been leftover after a - * failed write. - */ - ab->b_flags &= ~ARC_L2_WRITING; + arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } - mutex_exit(&l2arc_buflist_mtx); - - vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); - dev->l2ad_evict = taddr; + mutex_exit(&dev->l2ad_mtx); } /* * Find and write ARC buffers to the L2ARC device. 
* - * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. + * The headroom_boost is an in-out parameter used to maintain headroom boost + * state between calls to this function. + * + * Returns the number of bytes actually written (which may be smaller than + * the delta by which the device hand has changed due to alignment). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { - arc_buf_hdr_t *ab, *ab_prev, *head; - l2arc_buf_hdr_t *hdrl2; - list_t *list; - uint64_t passed_sz, write_sz, buf_sz, headroom; - void *buf_data; - kmutex_t *hash_lock, *list_lock; - boolean_t have_lock, full; + arc_buf_hdr_t *hdr, *hdr_prev, *head; + uint64_t write_asize, write_psize, write_sz, headroom; + boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; - uint64_t guid = spa_guid(spa); + uint64_t guid = spa_load_guid(spa); + int try; - ASSERT(dev->l2ad_vdev != NULL); + ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; - write_sz = 0; + write_sz = write_asize = write_psize = 0; full = B_FALSE; - head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); - head->b_flags |= ARC_L2_WRITE_HEAD; + head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * Copy buffers for L2ARC writing. */ - mutex_enter(&l2arc_buflist_mtx); - for (int try = 0; try <= 3; try++) { - list = l2arc_list_locked(try, &list_lock); - passed_sz = 0; + for (try = 0; try <= 3; try++) { + multilist_sublist_t *mls = l2arc_sublist_lock(try); + uint64_t passed_sz = 0; + + ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. @@ -4312,44 +7059,70 @@ l2arc_write_buffers(spa_t *spa, l2arc_de * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ - headroom = target_sz * l2arc_headroom; if (arc_warm == B_FALSE) - ab = list_head(list); + hdr = multilist_sublist_head(mls); else - ab = list_tail(list); + hdr = multilist_sublist_tail(mls); + if (hdr == NULL) + ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); + + headroom = target_sz * l2arc_headroom; + if (zfs_compressed_arc_enabled) + headroom = (headroom * l2arc_headroom_boost) / 100; + + for (; hdr; hdr = hdr_prev) { + kmutex_t *hash_lock; - for (; ab; ab = ab_prev) { if (arc_warm == B_FALSE) - ab_prev = list_next(list, ab); + hdr_prev = multilist_sublist_next(mls, hdr); else - ab_prev = list_prev(list, ab); - - hash_lock = HDR_LOCK(ab); - have_lock = MUTEX_HELD(hash_lock); - if (!have_lock && !mutex_tryenter(hash_lock)) { + hdr_prev = multilist_sublist_prev(mls, hdr); + ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, + HDR_GET_LSIZE(hdr)); + + hash_lock = HDR_LOCK(hdr); + if (!mutex_tryenter(hash_lock)) { + ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } - passed_sz += ab->b_size; + passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. 
*/ mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } - if (!l2arc_write_eligible(guid, ab)) { + if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } - if ((write_sz + ab->b_size) > target_sz) { + /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); + ASSERT3U(arc_hdr_size(hdr), >, 0); + uint64_t size = arc_hdr_size(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + size); + + if ((write_psize + asize) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_full); break; } @@ -4359,7 +7132,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_de * l2arc_write_done() can find where the * write buffers begin without searching. */ - list_insert_head(dev->l2ad_buflist, head); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, head); + mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -4367,76 +7142,92 @@ l2arc_write_buffers(spa_t *spa, l2arc_de cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); + ARCSTAT_BUMP(arcstat_l2_write_pios); } - /* - * Create and add a new L2ARC header. - */ - hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); - hdrl2->b_dev = dev; - hdrl2->b_daddr = dev->l2ad_hand; - - ab->b_flags |= ARC_L2_WRITING; - ab->b_l2hdr = hdrl2; - list_insert_head(dev->l2ad_buflist, ab); - buf_data = ab->b_buf->b_data; - buf_sz = ab->b_size; + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, + ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); + + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + + (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); /* - * Compute and store the buffer cksum before - * writing. On debug the cksum is verified first. + * Normally the L2ARC can use the hdr's data, but if + * we're sharing data between the hdr and one of its + * bufs, L2ARC needs its own copy of the data so that + * the ZIO below can't race with the buf consumer. To + * ensure that this copy will be available for the + * lifetime of the ZIO and be cleaned up afterwards, we + * add it to the l2arc_free_on_write queue. */ - arc_cksum_verify(ab->b_buf); - arc_cksum_compute(ab->b_buf, B_TRUE); - - mutex_exit(hash_lock); + void *to_write; + if (!HDR_SHARED_DATA(hdr) && size == asize) { + to_write = hdr->b_l1hdr.b_pdata; + } else { + arc_buf_contents_t type = arc_buf_type(hdr); + if (type == ARC_BUFC_METADATA) { + to_write = zio_buf_alloc(asize); + } else { + ASSERT3U(type, ==, ARC_BUFC_DATA); + to_write = zio_data_buf_alloc(asize); + } + bcopy(hdr->b_l1hdr.b_pdata, to_write, size); + if (asize != size) + bzero(to_write + size, asize - size); + l2arc_free_data_on_write(to_write, asize, type); + } wzio = zio_write_phys(pio, dev->l2ad_vdev, - dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, - NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + hdr->b_l2hdr.b_daddr, asize, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + write_sz += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); - (void) zio_nowait(wzio); - /* - * Keep the clock hand suitably device-aligned. 
- */ - buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); + write_asize += size; + write_psize += asize; + dev->l2ad_hand += asize; + + mutex_exit(hash_lock); - write_sz += buf_sz; - dev->l2ad_hand += buf_sz; + (void) zio_nowait(wzio); } - mutex_exit(list_lock); + multilist_sublist_unlock(mls); if (full == B_TRUE) break; } - mutex_exit(&l2arc_buflist_mtx); + /* No buffers selected for writing? */ if (pio == NULL) { - ASSERT3U(write_sz, ==, 0); - kmem_cache_free(hdr_cache, head); + ASSERT0(write_sz); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); return (0); } - ASSERT3U(write_sz, <=, target_sz); + ASSERT3U(write_psize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); - ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); - vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); + ARCSTAT_INCR(arcstat_l2_asize, write_asize); + vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - vdev_space_update(dev->l2ad_vdev, - dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; - dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } @@ -4444,7 +7235,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_de (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; - return (write_sz); + return (write_asize); } /* @@ -4452,13 +7243,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_de * heart of the L2ARC. */ static void -l2arc_feed_thread(void *unused __unused) +l2arc_feed_thread(void *dummy __unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; - clock_t begin, next = ddi_get_lbolt(); + clock_t begin, next = ddi_get_lbolt() + hz; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); @@ -4467,9 +7258,9 @@ l2arc_feed_thread(void *unused __unused) while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - (hz * l2arc_feed_secs)); + next - ddi_get_lbolt()); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); - next = ddi_get_lbolt(); + next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. @@ -4496,7 +7287,17 @@ l2arc_feed_thread(void *unused __unused) continue; spa = dev->l2ad_spa; - ASSERT(spa != NULL); + ASSERT3P(spa, !=, NULL); + + /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } /* * Avoid contributing to memory pressure. @@ -4509,7 +7310,7 @@ l2arc_feed_thread(void *unused __unused) ARCSTAT_BUMP(arcstat_l2_feeds); - size = l2arc_write_size(dev); + size = l2arc_write_size(); /* * Evict L2ARC buffers that will be overwritten. @@ -4561,31 +7362,30 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) ASSERT(!l2arc_vdev_present(vd)); + vdev_ashift_optimize(vd); + /* * Create a new l2arc device entry. 
*/ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; - adddev->l2ad_write = l2arc_write_max; - adddev->l2ad_boost = l2arc_write_boost; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; - adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; - ASSERT3U(adddev->l2ad_write, >, 0); + mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ - adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); - list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l2node)); + list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + refcount_create(&adddev->l2ad_alloc); /* * Add device to global list @@ -4615,7 +7415,7 @@ l2arc_remove_vdev(vdev_t *vd) break; } } - ASSERT(remdev != NULL); + ASSERT3P(remdev, !=, NULL); /* * Remove device from global list @@ -4629,8 +7429,9 @@ l2arc_remove_vdev(vdev_t *vd) * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); - list_destroy(remdev->l2ad_buflist); - kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + list_destroy(&remdev->l2ad_buflist); + mutex_destroy(&remdev->l2ad_mtx); + refcount_destroy(&remdev->l2ad_alloc); kmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -4645,7 +7446,6 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; @@ -4670,7 +7470,6 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); - mutex_destroy(&l2arc_buflist_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/blkptr.c 17 Jul 2014 16:23:18 -0000 @@ -0,0 +1,119 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include +#include +#include + +/* + * Embedded-data Block Pointers + * + * Normally, block pointers point (via their DVAs) to a block which holds data. + * If the data that we need to store is very small, this is an inefficient + * use of space, because a block must be at minimum 1 sector (typically 512 + * bytes or 4KB). Additionally, reading these small blocks tends to generate + * more random reads. 
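The encode/decode routines that follow pack the payload one byte at a time into the block pointer's 64-bit words, first byte in the low bits of the first word. A standalone sketch of just that byte packing (it skips the real blkptr layout details, such as the non-payload words that BPE_IS_PAYLOADWORD() steps over):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Pack "len" bytes little-endian into 64-bit words, then unpack them. */
    static void
    pack_bytes(uint64_t *words, const uint8_t *data, int len)
    {
        for (int i = 0; i < len; i++)
            words[i / 8] |= (uint64_t)data[i] << ((i % 8) * 8);
    }

    static void
    unpack_bytes(const uint64_t *words, uint8_t *data, int len)
    {
        for (int i = 0; i < len; i++)
            data[i] = (uint8_t)(words[i / 8] >> ((i % 8) * 8));
    }

    int
    main(void)
    {
        const uint8_t payload[] = "tiny embedded payload";
        uint64_t words[16] = { 0 };         /* a 112-byte payload fits in 14 words */
        uint8_t out[sizeof (payload)];

        pack_bytes(words, payload, sizeof (payload));
        unpack_bytes(words, out, sizeof (payload));
        assert(memcmp(payload, out, sizeof (payload)) == 0);
        printf("round trip ok: %s\n", (const char *)out);
        return (0);
    }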
+ * + * Embedded-data Block Pointers allow small pieces of data (the "payload", + * up to 112 bytes) to be stored in the block pointer itself, instead of + * being pointed to. The "Pointer" part of this name is a bit of a + * misnomer, as nothing is pointed to. + * + * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to + * be embedded in the block pointer. The logic for this is handled in + * the SPA, by the zio pipeline. Therefore most code outside the zio + * pipeline doesn't need special-cases to handle these block pointers. + * + * See spa.h for details on the exact layout of embedded block pointers. + */ + +void +encode_embedded_bp_compressed(blkptr_t *bp, void *data, + enum zio_compress comp, int uncompressed_size, int compressed_size) +{ + uint64_t *bp64 = (uint64_t *)bp; + uint64_t w = 0; + uint8_t *data8 = data; + + ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); + ASSERT(uncompressed_size == compressed_size || + comp != ZIO_COMPRESS_OFF); + ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + + bzero(bp, sizeof (*bp)); + BP_SET_EMBEDDED(bp, B_TRUE); + BP_SET_COMPRESS(bp, comp); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BPE_SET_LSIZE(bp, uncompressed_size); + BPE_SET_PSIZE(bp, compressed_size); + + /* + * Encode the byte array into the words of the block pointer. + * First byte goes into low bits of first word (little endian). + */ + for (int i = 0; i < compressed_size; i++) { + BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); + if (i % sizeof (w) == sizeof (w) - 1) { + /* we've reached the end of a word */ + ASSERT3P(bp64, <, bp + 1); + *bp64 = w; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + w = 0; + } + } + /* write last partial word */ + if (bp64 < (uint64_t *)(bp + 1)) + *bp64 = w; +} + +/* + * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be + * more than BPE_PAYLOAD_SIZE bytes). + */ +void +decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) +{ + int psize; + uint8_t *buf8 = buf; + uint64_t w = 0; + const uint64_t *bp64 = (const uint64_t *)bp; + + ASSERT(BP_IS_EMBEDDED(bp)); + + psize = BPE_GET_PSIZE(bp); + + /* + * Decode the words of the block pointer into the byte array. + * Low bits of first word are the first byte (little endian). + */ + for (int i = 0; i < psize; i++) { + if (i % sizeof (w) == 0) { + /* beginning of a word */ + ASSERT3P(bp64, <, bp + 1); + w = *bp64; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + } + buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); + } +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 bplist.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c 27 Feb 2010 22:30:41 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c 23 Mar 2013 15:29:23 -0000 @@ -19,351 +19,59 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include #include + void -bplist_init(bplist_t *bpl) +bplist_create(bplist_t *bpl) { - bzero(bpl, sizeof (*bpl)); mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&bpl->bpl_list, sizeof (bplist_entry_t), + offsetof(bplist_entry_t, bpe_node)); } void -bplist_fini(bplist_t *bpl) +bplist_destroy(bplist_t *bpl) { - ASSERT(bpl->bpl_queue == NULL); + list_destroy(&bpl->bpl_list); mutex_destroy(&bpl->bpl_lock); } -static int -bplist_hold(bplist_t *bpl) -{ - ASSERT(MUTEX_HELD(&bpl->bpl_lock)); - if (bpl->bpl_dbuf == NULL) { - int err = dmu_bonus_hold(bpl->bpl_mos, - bpl->bpl_object, bpl, &bpl->bpl_dbuf); - if (err) - return (err); - bpl->bpl_phys = bpl->bpl_dbuf->db_data; - } - return (0); -} - -uint64_t -bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx) -{ - int size; - - size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ? - BPLIST_SIZE_V0 : sizeof (bplist_phys_t); - - return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize, - DMU_OT_BPLIST_HDR, size, tx)); -} - void -bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx) +bplist_append(bplist_t *bpl, const blkptr_t *bp) { - VERIFY(dmu_object_free(mos, object, tx) == 0); -} - -int -bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object) -{ - dmu_object_info_t doi; - int err; - - err = dmu_object_info(mos, object, &doi); - if (err) - return (err); - - mutex_enter(&bpl->bpl_lock); - - ASSERT(bpl->bpl_dbuf == NULL); - ASSERT(bpl->bpl_phys == NULL); - ASSERT(bpl->bpl_cached_dbuf == NULL); - ASSERT(bpl->bpl_queue == NULL); - ASSERT(object != 0); - ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR); - - bpl->bpl_mos = mos; - bpl->bpl_object = object; - bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1); - bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT; - bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t)); - - mutex_exit(&bpl->bpl_lock); - return (0); -} + bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); -void -bplist_close(bplist_t *bpl) -{ mutex_enter(&bpl->bpl_lock); - - ASSERT(bpl->bpl_queue == NULL); - - if (bpl->bpl_cached_dbuf) { - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - bpl->bpl_cached_dbuf = NULL; - } - if (bpl->bpl_dbuf) { - dmu_buf_rele(bpl->bpl_dbuf, bpl); - bpl->bpl_dbuf = NULL; - bpl->bpl_phys = NULL; - } - + bpe->bpe_blk = *bp; + list_insert_tail(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); } -boolean_t -bplist_empty(bplist_t *bpl) -{ - boolean_t rv; - - if (bpl->bpl_object == 0) - return (B_TRUE); - - mutex_enter(&bpl->bpl_lock); - VERIFY(0 == bplist_hold(bpl)); /* XXX */ - rv = (bpl->bpl_phys->bpl_entries == 0); - mutex_exit(&bpl->bpl_lock); - - return (rv); -} - -static int -bplist_cache(bplist_t *bpl, uint64_t blkid) -{ - int err = 0; - - if (bpl->bpl_cached_dbuf == NULL || - bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) { - if (bpl->bpl_cached_dbuf != NULL) - dmu_buf_rele(bpl->bpl_cached_dbuf, bpl); - err = dmu_buf_hold(bpl->bpl_mos, - bpl->bpl_object, blkid << bpl->bpl_blockshift, - bpl, &bpl->bpl_cached_dbuf); - ASSERT(err || bpl->bpl_cached_dbuf->db_size == - 1ULL << bpl->bpl_blockshift); - } - return (err); -} - -int -bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp) -{ - uint64_t blk, off; - blkptr_t *bparray; - int err; - - mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - if (*itorp >= bpl->bpl_phys->bpl_entries) { - mutex_exit(&bpl->bpl_lock); - return (ENOENT); - } - 
- blk = *itorp >> bpl->bpl_bpshift; - off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - bparray = bpl->bpl_cached_dbuf->db_data; - *bp = bparray[off]; - (*itorp)++; - mutex_exit(&bpl->bpl_lock); - return (0); -} - -int -bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) -{ - uint64_t blk, off; - blkptr_t *bparray; - int err; - - ASSERT(!BP_IS_HOLE(bp)); - mutex_enter(&bpl->bpl_lock); - err = bplist_hold(bpl); - if (err) - return (err); - - blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift; - off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift); - - err = bplist_cache(bpl, blk); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx); - bparray = bpl->bpl_cached_dbuf->db_data; - bparray[off] = *bp; - - /* We never need the fill count. */ - bparray[off].blk_fill = 0; - - /* The bplist will compress better if we can leave off the checksum */ - if (!BP_GET_DEDUP(&bparray[off])) - bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); - - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - bpl->bpl_phys->bpl_entries++; - bpl->bpl_phys->bpl_bytes += - bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp); - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); - bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); - } - mutex_exit(&bpl->bpl_lock); - - return (0); -} - -void -bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx) -{ - VERIFY(bplist_enqueue(bpl, bp, tx) == 0); -} - /* - * Deferred entry; will be processed later by bplist_sync(). + * To aid debugging, we keep the most recently removed entry. This way if + * we are in the callback, we can easily locate the entry. 
*/ -void -bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) -{ - bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP); - - ASSERT(!BP_IS_HOLE(bp)); - mutex_enter(&bpl->bpl_lock); - bpq->bpq_blk = *bp; - bpq->bpq_next = bpl->bpl_queue; - bpl->bpl_queue = bpq; - mutex_exit(&bpl->bpl_lock); -} +static bplist_entry_t *bplist_iterate_last_removed; void -bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx) +bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) { - bplist_q_t *bpq; + bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpq = bpl->bpl_queue) != NULL) { - bpl->bpl_queue = bpq->bpq_next; + while (bpe = list_head(&bpl->bpl_list)) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); - func(arg, &bpq->bpq_blk, tx); - kmem_free(bpq, sizeof (*bpq)); + func(arg, &bpe->bpe_blk, tx); + kmem_free(bpe, sizeof (*bpe)); mutex_enter(&bpl->bpl_lock); } mutex_exit(&bpl->bpl_lock); } - -void -bplist_vacate(bplist_t *bpl, dmu_tx_t *tx) -{ - mutex_enter(&bpl->bpl_lock); - ASSERT3P(bpl->bpl_queue, ==, NULL); - VERIFY(0 == bplist_hold(bpl)); - dmu_buf_will_dirty(bpl->bpl_dbuf, tx); - VERIFY(0 == dmu_free_range(bpl->bpl_mos, - bpl->bpl_object, 0, -1ULL, tx)); - bpl->bpl_phys->bpl_entries = 0; - bpl->bpl_phys->bpl_bytes = 0; - if (bpl->bpl_havecomp) { - bpl->bpl_phys->bpl_comp = 0; - bpl->bpl_phys->bpl_uncomp = 0; - } - mutex_exit(&bpl->bpl_lock); -} - -int -bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) -{ - int err; - - mutex_enter(&bpl->bpl_lock); - - err = bplist_hold(bpl); - if (err) { - mutex_exit(&bpl->bpl_lock); - return (err); - } - - *usedp = bpl->bpl_phys->bpl_bytes; - if (bpl->bpl_havecomp) { - *compp = bpl->bpl_phys->bpl_comp; - *uncompp = bpl->bpl_phys->bpl_uncomp; - } - mutex_exit(&bpl->bpl_lock); - - if (!bpl->bpl_havecomp) { - uint64_t itor = 0, comp = 0, uncomp = 0; - blkptr_t bp; - - while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { - comp += BP_GET_PSIZE(&bp); - uncomp += BP_GET_UCSIZE(&bp); - } - if (err == ENOENT) - err = 0; - *compp = comp; - *uncompp = uncomp; - } - - return (err); -} - -/* - * Return (in *dsizep) the amount of space on the deadlist which is: - * mintxg < blk_birth <= maxtxg - */ -int -bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, - uint64_t *dsizep) -{ - uint64_t size = 0; - uint64_t itor = 0; - blkptr_t bp; - int err; - - /* - * As an optimization, if they want the whole txg range, just - * get bpl_bytes rather than iterating over the bps. 
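The new bplist is purely an in-memory holding area: bplist_create() sets up a mutex and a list, bplist_append() copies the bp into a list entry, and bplist_iterate() pops entries and hands each to a callback, dropping bpl_lock around the call. A minimal userland sketch of that call pattern; the fake_* types and *_sketch functions are local stand-ins, not the kernel API, and the locking is omitted:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Local stand-ins: a trivial "block pointer" and a deferred-free list. */
    typedef struct { uint64_t birth; } fake_bp_t;

    typedef struct entry {
        fake_bp_t       bp;
        struct entry    *next;
    } entry_t;

    typedef struct { entry_t *head; entry_t **tail; } fake_bplist_t;

    static void
    bplist_append_sketch(fake_bplist_t *bpl, const fake_bp_t *bp)
    {
        entry_t *e = malloc(sizeof (*e));   /* cf. kmem_alloc(..., KM_SLEEP) */

        e->bp = *bp;                        /* the bp is copied, not referenced */
        e->next = NULL;
        *bpl->tail = e;
        bpl->tail = &e->next;
    }

    static void
    bplist_iterate_sketch(fake_bplist_t *bpl,
        void (*func)(void *, const fake_bp_t *), void *arg)
    {
        entry_t *e;

        while ((e = bpl->head) != NULL) {   /* pop, call back, free */
            bpl->head = e->next;
            func(arg, &e->bp);
            free(e);
        }
        bpl->tail = &bpl->head;
    }

    static void
    print_bp(void *arg, const fake_bp_t *bp)
    {
        (void)arg;
        printf("processing bp born in txg %llu\n", (unsigned long long)bp->birth);
    }

    int
    main(void)
    {
        fake_bplist_t bpl;
        fake_bp_t bp1 = { 100 }, bp2 = { 101 };

        bpl.head = NULL;
        bpl.tail = &bpl.head;
        bplist_append_sketch(&bpl, &bp1);
        bplist_append_sketch(&bpl, &bp2);
        bplist_iterate_sketch(&bpl, print_bp, NULL);
        return (0);
    }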
- */ - if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { - mutex_enter(&bpl->bpl_lock); - err = bplist_hold(bpl); - if (err == 0) - *dsizep = bpl->bpl_phys->bpl_bytes; - mutex_exit(&bpl->bpl_lock); - return (err); - } - - while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { - if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { - size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp); - } - } - if (err == ENOENT) - err = 0; - *dsizep = size; - return (err); -} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bpobj.c 27 Mar 2016 02:52:19 -0000 @@ -0,0 +1,592 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include +#include +#include +#include +#include +#include + +/* + * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 
+ */ +uint64_t +bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + dsl_pool_t *dp = dmu_objset_pool(os); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { + if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { + ASSERT0(dp->dp_empty_bpobj); + dp->dp_empty_bpobj = + bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY(zap_add(os, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj, tx) == 0); + } + spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); + ASSERT(dp->dp_empty_bpobj != 0); + return (dp->dp_empty_bpobj); + } else { + return (bpobj_alloc(os, blocksize, tx)); + } +} + +void +bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_objset_pool(os); + + spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); + if (!spa_feature_is_active(dmu_objset_spa(os), + SPA_FEATURE_EMPTY_BPOBJ)) { + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, tx)); + VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); + dp->dp_empty_bpobj = 0; + } +} + +uint64_t +bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + int size; + + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) + size = BPOBJ_SIZE_V0; + else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + size = BPOBJ_SIZE_V1; + else + size = sizeof (bpobj_phys_t); + + return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, + DMU_OT_BPOBJ_HDR, size, tx)); +} + +void +bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + int64_t i; + bpobj_t bpo; + dmu_object_info_t doi; + int epb; + dmu_buf_t *dbuf = NULL; + + ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); + VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); + + mutex_enter(&bpo.bpo_lock); + + if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) + goto out; + + VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + VERIFY3U(0, ==, dmu_buf_hold(os, + bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + bpobj_free(os, objarray[blkoff], tx); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); + +out: + mutex_exit(&bpo.bpo_lock); + bpobj_close(&bpo); + + VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); +} + +int +bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(os, object, &doi); + if (err) + return (err); + + bzero(bpo, sizeof (*bpo)); + mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); + + ASSERT(bpo->bpo_dbuf == NULL); + ASSERT(bpo->bpo_phys == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + + err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + + bpo->bpo_os = os; + bpo->bpo_object = object; + bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; + bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); + bpo->bpo_havesubobj = 
(doi.doi_bonus_size > BPOBJ_SIZE_V1); + bpo->bpo_phys = bpo->bpo_dbuf->db_data; + return (0); +} + +void +bpobj_close(bpobj_t *bpo) +{ + /* Lame workaround for closing a bpobj that was never opened. */ + if (bpo->bpo_object == 0) + return; + + dmu_buf_rele(bpo->bpo_dbuf, bpo); + if (bpo->bpo_cached_dbuf != NULL) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + bpo->bpo_dbuf = NULL; + bpo->bpo_phys = NULL; + bpo->bpo_cached_dbuf = NULL; + bpo->bpo_object = 0; + + mutex_destroy(&bpo->bpo_lock); +} + +static boolean_t +bpobj_hasentries(bpobj_t *bpo) +{ + return (bpo->bpo_phys->bpo_num_blkptrs != 0 || + (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0)); +} + +static int +bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, + boolean_t free) +{ + dmu_object_info_t doi; + int epb; + int64_t i; + int err = 0; + dmu_buf_t *dbuf = NULL; + + mutex_enter(&bpo->bpo_lock); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) { + blkptr_t *bparray; + blkptr_t *bp; + uint64_t offset, blkoff; + + offset = i * sizeof (blkptr_t); + blkoff = P2PHASE(i, bpo->bpo_epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset, + FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + bparray = dbuf->db_data; + bp = &bparray[blkoff]; + err = func(arg, bp, tx); + if (err) + break; + if (free) { + bpo->bpo_phys->bpo_bytes -= + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp); + } + bpo->bpo_phys->bpo_num_blkptrs--; + ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, + (i + 1) * sizeof (blkptr_t), -1ULL, tx)); + } + if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) + goto out; + + ASSERT(bpo->bpo_havecomp); + err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi); + if (err) { + mutex_exit(&bpo->bpo_lock); + return (err); + } + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + bpobj_t sublist; + uint64_t used_before, comp_before, uncomp_before; + uint64_t used_after, comp_after, uncomp_after; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); + if (err) + break; + if (free) { + err = bpobj_space(&sublist, + &used_before, &comp_before, &uncomp_before); + if (err != 0) { + bpobj_close(&sublist); + break; + } + } + err = bpobj_iterate_impl(&sublist, func, arg, tx, free); + if (free) { + VERIFY3U(0, ==, bpobj_space(&sublist, + &used_after, &comp_after, &uncomp_after)); + bpo->bpo_phys->bpo_bytes -= used_before - used_after; + 
ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0); + bpo->bpo_phys->bpo_comp -= comp_before - comp_after; + bpo->bpo_phys->bpo_uncomp -= + uncomp_before - uncomp_after; + } + + bpobj_close(&sublist); + if (err) + break; + if (free) { + err = dmu_object_free(bpo->bpo_os, + objarray[blkoff], tx); + if (err) + break; + bpo->bpo_phys->bpo_num_subobjs--; + ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0); + } + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + if (free) { + VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + (i + 1) * sizeof (uint64_t), -1ULL, tx)); + } + +out: + /* If there are no entries, there should be no bytes. */ + if (!bpobj_hasentries(bpo)) { + ASSERT0(bpo->bpo_phys->bpo_bytes); + ASSERT0(bpo->bpo_phys->bpo_comp); + ASSERT0(bpo->bpo_phys->bpo_uncomp); + } + + mutex_exit(&bpo->bpo_lock); + return (err); +} + +/* + * Iterate and remove the entries. If func returns nonzero, iteration + * will stop and that entry will not be removed. + */ +int +bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE)); +} + +/* + * Iterate the entries. If func returns nonzero, iteration will stop. + */ +int +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE)); +} + +void +bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) +{ + bpobj_t subbpo; + uint64_t used, comp, uncomp, subsubobjs; + + ASSERT(bpo->bpo_havesubobj); + ASSERT(bpo->bpo_havecomp); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { + bpobj_decr_empty(bpo->bpo_os, tx); + return; + } + + VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); + + if (!bpobj_hasentries(&subbpo)) { + /* No point in having an empty subobj. */ + bpobj_close(&subbpo); + bpobj_free(bpo->bpo_os, subobj, tx); + return; + } + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + + dmu_object_info_t doi; + ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); + + mutex_enter(&bpo->bpo_lock); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + + /* + * If subobj has only one block of subobjs, then move subobj's + * subobjs to bpo's subobj list directly. This reduces + * recursion in bpobj_iterate due to nested subobjs. + */ + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + if (subsubobjs != 0) { + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset == doi.doi_data_block_size) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. 
+ */ + VERIFY3U(subdb->db_size, >=, + numsubsub * sizeof (subobj)); + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os, + subsubobjs, tx)); + } + } + bpo->bpo_phys->bpo_bytes += used; + bpo->bpo_phys->bpo_comp += comp; + bpo->bpo_phys->bpo_uncomp += uncomp; + mutex_exit(&bpo->bpo_lock); + + bpobj_close(&subbpo); +} + +void +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) +{ + blkptr_t stored_bp = *bp; + uint64_t offset; + int blkoff; + blkptr_t *bparray; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + + if (BP_IS_EMBEDDED(bp)) { + /* + * The bpobj will compress better without the payload. + * + * Note that we store EMBEDDED bp's because they have an + * uncompressed size, which must be accounted for. An + * alternative would be to add their size to bpo_uncomp + * without storing the bp, but that would create additional + * complications: bpo_uncomp would be inconsistent with the + * set of BP's stored, and bpobj_iterate() wouldn't visit + * all the space accounted for in the bpobj. + */ + bzero(&stored_bp, sizeof (stored_bp)); + stored_bp.blk_prop = bp->blk_prop; + stored_bp.blk_birth = bp->blk_birth; + } else if (!BP_GET_DEDUP(bp)) { + /* The bpobj will compress better without the checksum */ + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + } + + /* We never need the fill count. */ + stored_bp.blk_fill = 0; + + mutex_enter(&bpo->bpo_lock); + + offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); + blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); + + if (bpo->bpo_cached_dbuf == NULL || + offset < bpo->bpo_cached_dbuf->db_offset || + offset >= bpo->bpo_cached_dbuf->db_offset + + bpo->bpo_cached_dbuf->db_size) { + if (bpo->bpo_cached_dbuf) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, bpo, &bpo->bpo_cached_dbuf, 0)); + } + + dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); + bparray = bpo->bpo_cached_dbuf->db_data; + bparray[blkoff] = stored_bp; + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + bpo->bpo_phys->bpo_num_blkptrs++; + bpo->bpo_phys->bpo_bytes += + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp); + } + mutex_exit(&bpo->bpo_lock); +} + +struct space_range_arg { + spa_t *spa; + uint64_t mintxg; + uint64_t maxtxg; + uint64_t used; + uint64_t comp; + uint64_t uncomp; +}; + +/* ARGSUSED */ +static int +space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct space_range_arg *sra = arg; + + if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) + sra->used += bp_get_dsize_sync(sra->spa, bp); + else + sra->used += bp_get_dsize(sra->spa, bp); + sra->comp += BP_GET_PSIZE(bp); + sra->uncomp += BP_GET_UCSIZE(bp); + } + return (0); +} + +int +bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + mutex_enter(&bpo->bpo_lock); + + *usedp = bpo->bpo_phys->bpo_bytes; + if (bpo->bpo_havecomp) { + *compp = bpo->bpo_phys->bpo_comp; + *uncompp = bpo->bpo_phys->bpo_uncomp; + 
mutex_exit(&bpo->bpo_lock); + return (0); + } else { + mutex_exit(&bpo->bpo_lock); + return (bpobj_space_range(bpo, 0, UINT64_MAX, + usedp, compp, uncompp)); + } +} + +/* + * Return the amount of space in the bpobj which is: + * mintxg < blk_birth <= maxtxg + */ +int +bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + struct space_range_arg sra = { 0 }; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpo_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) + return (bpobj_space(bpo, usedp, compp, uncompp)); + + sra.spa = dmu_objset_spa(bpo->bpo_os); + sra.mintxg = mintxg; + sra.maxtxg = maxtxg; + + err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); + *usedp = sra.used; + *compp = sra.comp; + *uncompp = sra.uncomp; + return (err); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bptree.c 10 Oct 2016 11:09:56 -0000 @@ -0,0 +1,301 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code, + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. 
+ */ + +struct bptree_args { + bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ + boolean_t ba_free; /* true if freeing during traversal */ + + bptree_itor_t *ba_func; /* function to call for each blockpointer */ + void *ba_arg; /* caller supplied argument to ba_func */ + dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ +} bptree_args_t; + +uint64_t +bptree_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t obj; + dmu_buf_t *db; + bptree_phys_t *bt; + + obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (bptree_phys_t), tx); + + /* + * Bonus buffer contents are already initialized to 0, but for + * readability we make it explicit. + */ + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + bt = db->db_data; + bt->bt_begin = 0; + bt->bt_end = 0; + bt->bt_bytes = 0; + bt->bt_comp = 0; + bt->bt_uncomp = 0; + dmu_buf_rele(db, FTAG); + + return (obj); +} + +int +bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + ASSERT3U(bt->bt_begin, ==, bt->bt_end); + ASSERT0(bt->bt_bytes); + ASSERT0(bt->bt_comp); + ASSERT0(bt->bt_uncomp); + dmu_buf_rele(db, FTAG); + + return (dmu_object_free(os, obj, tx)); +} + +boolean_t +bptree_is_empty(objset_t *os, uint64_t obj) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + boolean_t rv; + + VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + rv = (bt->bt_begin == bt->bt_end); + dmu_buf_rele(db, FTAG); + return (rv); +} + +void +bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + bptree_entry_phys_t bte = { 0 }; + + /* + * bptree objects are in the pool mos, therefore they can only be + * modified in syncing context. Furthermore, this is only modified + * by the sync thread, so no locking is necessary. + */ + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + + bte.be_birth_txg = birth_txg; + bte.be_bp = *bp; + dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); + + dmu_buf_will_dirty(db, tx); + bt->bt_end++; + bt->bt_bytes += bytes; + bt->bt_comp += comp; + bt->bt_uncomp += uncomp; + dmu_buf_rele(db, FTAG); +} + +/* ARGSUSED */ +static int +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + int err; + struct bptree_args *ba = arg; + + if (bp == NULL || BP_IS_HOLE(bp)) + return (0); + + err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); + if (err == 0 && ba->ba_free) { + ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); + ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); + ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); + } + return (err); +} + +/* + * If "free" is set: + * - It is assumed that "func" will be freeing the block pointers. + * - If "func" returns nonzero, the bookmark will be remembered and + * iteration will be restarted from this point on next invocation. + * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), + * bptree_iterate will remember the bookmark, continue traversing + * any additional entries, and return 0. + * + * If "free" is not set, traversal will stop and return an error if + * an i/o error is encountered. 
+ * + * In either case, if zfs_free_leak_on_eio is set, i/o errors will be + * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to + * traverse_dataset_destroyed()). + */ +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + boolean_t ioerr = B_FALSE; + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + if (zfs_free_leak_on_eio) + flags |= TRAVERSE_HARD; + zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " + "bookmark %lld/%lld/%lld/%lld", + (longlong_t)i, + (longlong_t)bte.be_birth_txg, + (longlong_t)bte.be_zb.zb_objset, + (longlong_t)bte.be_zb.zb_object, + (longlong_t)bte.be_zb.zb_level, + (longlong_t)bte.be_zb.zb_blkid); + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, flags, + bptree_visit_cb, &ba); + if (free) { + /* + * The callback has freed the visited block pointers. + * Record our traversal progress on disk, either by + * updating this record's bookmark, or by logically + * removing this record by advancing bt_begin. + */ + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT0(bte.be_zb.zb_level); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + if (err == EIO || err == ECKSUM || + err == ENXIO) { + /* + * Skip the rest of this tree and + * continue on to the next entry. + */ + err = 0; + ioerr = B_TRUE; + } else { + break; + } + } else if (ioerr) { + /* + * This entry is finished, but there were + * i/o errors on previous entries, so we + * can't adjust bt_begin. Set this entry's + * be_birth_txg such that it will be + * treated as a no-op in future traversals. 
+ */ + bte.be_birth_txg = UINT64_MAX; + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + } + + if (!ioerr) { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } else if (err != 0) { + break; + } + } + + ASSERT(!free || err != 0 || ioerr || + ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + if (zfs_free_leak_on_eio) { + ba.ba_phys->bt_bytes = 0; + ba.ba_phys->bt_comp = 0; + ba.ba_phys->bt_uncomp = 0; + } + + ASSERT0(ba.ba_phys->bt_bytes); + ASSERT0(ba.ba_phys->bt_comp); + ASSERT0(ba.ba_phys->bt_uncomp); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/bqueue.c 30 Aug 2015 02:18:11 -0000 @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#include +#include + +static inline bqueue_node_t * +obj2node(bqueue_t *q, void *data) +{ + return ((bqueue_node_t *)((char *)data + q->bq_node_offset)); +} + +/* + * Initialize a blocking queue The maximum capacity of the queue is set to + * size. Types that want to be stored in a bqueue must contain a bqueue_node_t, + * and offset should give its offset from the start of the struct. Return 0 on + * success, or -1 on failure. + */ +int +bqueue_init(bqueue_t *q, uint64_t size, size_t node_offset) +{ + list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t), + node_offset + offsetof(bqueue_node_t, bqn_node)); + cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL); + cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL); + q->bq_node_offset = node_offset; + q->bq_size = 0; + q->bq_maxsize = size; + return (0); +} + +/* + * Destroy a blocking queue. This function asserts that there are no + * elements in the queue, and no one is blocked on the condition + * variables. + */ +void +bqueue_destroy(bqueue_t *q) +{ + ASSERT0(q->bq_size); + cv_destroy(&q->bq_add_cv); + cv_destroy(&q->bq_pop_cv); + mutex_destroy(&q->bq_lock); + list_destroy(&q->bq_list); +} + +/* + * Add data to q, consuming size units of capacity. If there is insufficient + * capacity to consume size units, block until capacity exists. Asserts size is + * > 0. + */ +void +bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +{ + ASSERT3U(item_size, >, 0); + ASSERT3U(item_size, <, q->bq_maxsize); + mutex_enter(&q->bq_lock); + obj2node(q, data)->bqn_size = item_size; + while (q->bq_size + item_size > q->bq_maxsize) { + cv_wait(&q->bq_add_cv, &q->bq_lock); + } + q->bq_size += item_size; + list_insert_tail(&q->bq_list, data); + cv_signal(&q->bq_pop_cv); + mutex_exit(&q->bq_lock); +} +/* + * Take the first element off of q. 
If there are no elements on the queue, wait + * until one is put there. Return the removed element. + */ +void * +bqueue_dequeue(bqueue_t *q) +{ + void *ret; + uint64_t item_size; + mutex_enter(&q->bq_lock); + while (q->bq_size == 0) { + cv_wait(&q->bq_pop_cv, &q->bq_lock); + } + ret = list_remove_head(&q->bq_list); + item_size = obj2node(q, ret)->bqn_size; + q->bq_size -= item_size; + mutex_exit(&q->bq_lock); + cv_signal(&q->bq_add_cv); + return (ret); +} + +/* + * Returns true if the space used is 0. + */ +boolean_t +bqueue_empty(bqueue_t *q) +{ + return (q->bq_size == 0); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c,v retrieving revision 1.5 diff -u -p -r1.5 dbuf.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c 19 Feb 2016 19:25:59 -0000 1.5 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c 26 Apr 2017 00:40:35 -0000 @@ -19,12 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include #include +#include #include #include #include @@ -34,26 +40,113 @@ #include #include #include +#include +#include +#include +#include +#include +#include -static void dbuf_destroy(dmu_buf_impl_t *db); -static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +uint_t zfs_dbuf_evict_key; + +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +#ifndef __lint +extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, + dmu_buf_evict_func_t *evict_func_sync, + dmu_buf_evict_func_t *evict_func_async, + dmu_buf_t **clear_on_evict_dbufp); +#endif /* ! __lint */ + /* * Global data structures and functions for the dbuf cache. */ -static kmem_cache_t *dbuf_cache; +static kmem_cache_t *dbuf_kmem_cache; +static taskq_t *dbu_evict_taskq; + +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. + * Dbufs are added to the dbuf cache once the last hold is released. If a + * dbuf is later accessed and still exists in the dbuf cache, then it will + * be removed from the cache and later re-added to the head of the cache. + * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + */ +static multilist_t dbuf_cache; +static refcount_t dbuf_cache_size; +uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024; + +/* Cap the size of the dbuf cache to log2 fraction of arc size. 
*/ +int dbuf_cache_max_shift = 5; + +/* + * The dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e max + * cache size). Once the eviction thread is woken up and eviction is required, + * it will continue evicting buffers until it's able to reduce the cache size + * to the low water mark. If the cache size continues to grow and hits the high + * water mark, then callers adding elments to the cache will begin to evict + * directly from the cache until the cache is no longer above the high water + * mark. + */ + +/* + * The percentage above and below the maximum cache size. 
+ */ +uint_t dbuf_cache_hiwater_pct = 10; +uint_t dbuf_cache_lowater_pct = 10; /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { - dmu_buf_impl_t *db = unused; - bzero(db, sizeof (dmu_buf_impl_t)); + dmu_buf_impl_t *db = vdb; +#ifdef __NetBSD__ + db = unused; +#endif + bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + multilist_link_init(&db->db_cache_link); refcount_create(&db->db_holds); + return (0); } @@ -61,9 +154,14 @@ dbuf_cons(void *vdb, void *unused, int k static void dbuf_dest(void *vdb, void *unused) { - dmu_buf_impl_t *db = unused; + dmu_buf_impl_t *db = vdb; + +#ifdef __NetBSD__ + db = unused; +#endif mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); + ASSERT(!multilist_link_active(&db->db_cache_link)); refcount_destroy(&db->db_holds); } @@ -93,8 +191,6 @@ dbuf_hash(void *os, uint64_t obj, uint8_ return (crc); } -#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); - #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ @@ -102,12 +198,10 @@ dbuf_hash(void *os, uint64_t obj, uint8_ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * -dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) +dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_t *os = dn->dn_objset; - uint64_t obj = dn->dn_object; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; @@ -126,6 +220,24 @@ dbuf_find(dnode_t *dn, uint8_t level, ui return (NULL); } +static dmu_buf_impl_t * +dbuf_find_bonus(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_buf_impl_t *db = NULL; + + if (dnode_hold(os, object, FTAG, &dn) == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus != NULL) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + } + return (db); +} + /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element @@ -140,7 +252,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; - uint64_t hv = DBUF_HASH(os, obj, level, blkid); + uint64_t hv = dbuf_hash(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; @@ -160,26 +272,25 @@ dbuf_hash_insert(dmu_buf_impl_t *db) db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_add_64(&dbuf_hash_count, 1); + atomic_inc_64(&dbuf_hash_count); return (NULL); } /* - * Remove an entry from the hash table. This operation will - * fail if there are any existing holds on the db. + * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, + uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; /* - * We musn't hold db_mtx to maintin lock ordering: + * We musn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. 
*/ ASSERT(refcount_is_zero(&db->db_holds)); @@ -195,36 +306,281 @@ dbuf_hash_remove(dmu_buf_impl_t *db) *dbp = db->db_hash_next; db->db_hash_next = NULL; mutex_exit(DBUF_HASH_MUTEX(h, idx)); - atomic_add_64(&dbuf_hash_count, -1); + atomic_dec_64(&dbuf_hash_count); } -static arc_evict_func_t dbuf_do_evict; +typedef enum { + DBVU_EVICTING, + DBVU_NOT_EVICTING +} dbvu_verify_type_t; + +static void +dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) +{ +#ifdef ZFS_DEBUG + int64_t holds; + + if (db->db_user == NULL) + return; + + /* Only data blocks support the attachment of user data. */ + ASSERT(db->db_level == 0); + + /* Clients must resolve a dbuf before attaching user data. */ + ASSERT(db->db.db_data != NULL); + ASSERT3U(db->db_state, ==, DB_CACHED); + + holds = refcount_count(&db->db_holds); + if (verify_type == DBVU_EVICTING) { + /* + * Immediate eviction occurs when holds == dirtycnt. + * For normal eviction buffers, holds is zero on + * eviction, except when dbuf_fix_old_data() calls + * dbuf_clear_data(). However, the hold count can grow + * during eviction even though db_mtx is held (see + * dmu_bonus_hold() for an example), so we can only + * test the generic invariant that holds >= dirtycnt. + */ + ASSERT3U(holds, >=, db->db_dirtycnt); + } else { + if (db->db_user_immediate_evict == TRUE) + ASSERT3U(holds, >=, db->db_dirtycnt); + else + ASSERT3U(holds, >, 0); + } +#endif +} static void dbuf_evict_user(dmu_buf_impl_t *db) { + dmu_buf_user_t *dbu = db->db_user; + ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_evict_func == NULL) + if (dbu == NULL) return; - if (db->db_user_data_ptr_ptr) - *db->db_user_data_ptr_ptr = db->db.db_data; - db->db_evict_func(&db->db, db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; + dbuf_verify_user(db, DBVU_EVICTING); + db->db_user = NULL; + +#ifdef ZFS_DEBUG + if (dbu->dbu_clear_on_evict_dbufp != NULL) + *dbu->dbu_clear_on_evict_dbufp = NULL; +#endif + + /* + * There are two eviction callbacks - one that we call synchronously + * and one that we invoke via a taskq. The async one is useful for + * avoiding lock order reversals and limiting stack depth. + * + * Note that if we have a sync callback but no async callback, + * it's likely that the sync callback will free the structure + * containing the dbu. In that case we need to take care to not + * dereference dbu after calling the sync evict func. + */ + boolean_t has_async = (dbu->dbu_evict_func_async != NULL); + + if (dbu->dbu_evict_func_sync != NULL) + dbu->dbu_evict_func_sync(dbu); + + if (has_async) { + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, + dbu, 0, &dbu->dbu_tqent); + } } -void -dbuf_evict(dmu_buf_impl_t *db) +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) { - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL); - ASSERT(db->db_data_pending == NULL); + if (db->db_level > 0) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the dbuf eviction + * code is laid out; dbuf_evict_thread() assumes dbufs are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. 
+ */ +unsigned int +dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) +{ + dmu_buf_impl_t *db = obj; + + /* + * The assumption here, is the hash value for a given + * dmu_buf_impl_t will remain constant throughout it's lifetime + * (i.e. it's objset, object, level and blkid fields don't change). + * Thus, we don't need to store the dbuf's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublists' usage + * would not be evenly distributed. + */ + return (dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid) % + multilist_get_num_sublists(ml)); +} + +static inline boolean_t +dbuf_cache_above_hiwater(void) +{ + uint64_t dbuf_cache_hiwater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes); +} + +static inline boolean_t +dbuf_cache_above_lowater(void) +{ + uint64_t dbuf_cache_lowater_bytes = + (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100; + + return (refcount_count(&dbuf_cache_size) > + dbuf_cache_max_bytes - dbuf_cache_lowater_bytes); +} + +/* + * Evict the oldest eligible dbuf from the dbuf cache. + */ +static void +dbuf_evict_one(void) +{ + int idx = multilist_get_random_index(&dbuf_cache); + multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx); + + ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); + + /* + * Set the thread's tsd to indicate that it's processing evictions. + * Once a thread stops evicting from the dbuf cache it will + * reset its tsd to NULL. + */ + ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL); + (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE); + + dmu_buf_impl_t *db = multilist_sublist_tail(mls); + while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { + db = multilist_sublist_prev(mls, db); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); + + if (db != NULL) { + multilist_sublist_remove(mls, db); + multilist_sublist_unlock(mls); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + dbuf_destroy(db); + } else { + multilist_sublist_unlock(mls); + } + (void) tsd_set(zfs_dbuf_evict_key, NULL); +} + +/* + * The dbuf evict thread is responsible for aging out dbufs from the + * cache. Once the cache has reached it's maximum size, dbufs are removed + * and destroyed. The eviction thread will continue running until the size + * of the dbuf cache is at or below the maximum size. Once the dbuf is aged + * out of the cache it is destroyed and becomes eligible for arc eviction. + */ +static void +dbuf_evict_thread(void *dummy __unused) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); + + mutex_enter(&dbuf_evict_lock); + while (!dbuf_evict_thread_exit) { + while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_hires(&dbuf_evict_cv, + &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + /* + * Keep evicting as long as we're above the low water mark + * for the cache. We do this without holding the locks to + * minimize lock contention. 
+ */ + while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + dbuf_evict_one(); + } + + mutex_enter(&dbuf_evict_lock); + } + + dbuf_evict_thread_exit = B_FALSE; + cv_broadcast(&dbuf_evict_cv); + CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ + thread_exit(); +} + +/* + * Wake up the dbuf eviction thread if the dbuf cache is at its max size. + * If the dbuf cache is at its high water mark, then evict a dbuf from the + * dbuf cache using the callers context. + */ +static void +dbuf_evict_notify(void) +{ + + /* + * We use thread specific data to track when a thread has + * started processing evictions. This allows us to avoid deeply + * nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + * The dbuf_eviction_thread will always have its tsd set until + * that thread exits. All other threads will only set their tsd + * if they are participating in the eviction process. This only + * happens if the eviction thread is unable to process evictions + * fast enough. To keep the dbuf cache size in check, other threads + * can evict from the dbuf cache directly. Those threads will set + * their tsd values so that we ensure that they only evict one dbuf + * from the dbuf cache. + */ + if (tsd_get(zfs_dbuf_evict_key) != NULL) + return; + + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + boolean_t evict_now = B_FALSE; - dbuf_clear(db); - dbuf_destroy(db); + mutex_enter(&dbuf_evict_lock); + if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) { + evict_now = dbuf_cache_above_hiwater(); + cv_signal(&dbuf_evict_cv); + } + mutex_exit(&dbuf_evict_lock); + + if (evict_now) { + dbuf_evict_one(); + } + } } void @@ -233,7 +589,7 @@ dbuf_init(void) uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; - + /* * The hash table is big enough to fill all of physical memory * with an average 4K block size. The table will take up @@ -252,12 +608,38 @@ retry: goto retry; } - dbuf_cache = kmem_cache_create("dmu_buf_impl_t", + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + /* + * Setup the parameters for the dbuf cache. We cap the size of the + * dbuf cache to 1/32nd (default) of the size of the ARC. + */ + dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes, + arc_max_bytes() >> dbuf_cache_max_shift); + + /* + * All entries are queued via taskq_dispatch_ent(), so min/maxalloc + * configuration is not required. 
+ */ + dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); + + multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + zfs_arc_num_sublists_per_state, + dbuf_cache_multilist_index_func); + refcount_create(&dbuf_cache_size); + + tsd_create(&zfs_dbuf_evict_key, NULL); + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); } void @@ -269,7 +651,23 @@ dbuf_fini(void) for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); - kmem_cache_destroy(dbuf_cache); + kmem_cache_destroy(dbuf_kmem_cache); + taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + tsd_destroy(&zfs_dbuf_evict_key); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + refcount_destroy(&dbuf_cache_size); + multilist_destroy(&dbuf_cache); } /* @@ -280,7 +678,7 @@ dbuf_fini(void) static void dbuf_verify(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -289,6 +687,8 @@ dbuf_verify(dmu_buf_impl_t *db) return; ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); @@ -296,13 +696,18 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); - ASSERT(db->db_blkid == DB_BONUS_BLKID || - list_head(&dn->dn_dbufs)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !avl_is_empty(&dn->dn_dbufs)); } - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); + } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); + ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } @@ -336,8 +741,9 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; @@ -349,7 +755,7 @@ dbuf_verify(dmu_buf_impl_t *db) * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? 
*/ - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); @@ -357,53 +763,83 @@ dbuf_verify(dmu_buf_impl_t *db) } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && - db->db.db_data && db->db_blkid != DB_BONUS_BLKID && + (db->db_buf == NULL || db->db_buf->b_data) && + db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. + * + * There is an exception to this rule for indirect blocks; in + * this case, if the indirect block is a hole, we fill in a few + * fields on each of the child blocks (importantly, birth time) + * to prevent hole birth times from being lost when you + * partially fill in a hole. */ if (db->db_dirtycnt == 0) { - uint64_t *buf = db->db.db_data; - int i; - - for (i = 0; i < db->db.db_size >> 3; i++) { - ASSERT(buf[i] == 0); + if (db->db_level == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } else { + blkptr_t *bps = db->db.db_data; + ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, + db->db.db_size); + /* + * We want to verify that all the blkptrs in the + * indirect block are holes, but we may have + * automatically set up a few fields for them. + * We iterate through each blkptr and verify + * they only have those fields set. + */ + for (int i = 0; + i < db->db.db_size / sizeof (blkptr_t); + i++) { + blkptr_t *bp = &bps[i]; + ASSERT(ZIO_CHECKSUM_IS_ZERO( + &bp->blk_cksum)); + ASSERT( + DVA_IS_EMPTY(&bp->blk_dva[0]) && + DVA_IS_EMPTY(&bp->blk_dva[1]) && + DVA_IS_EMPTY(&bp->blk_dva[2])); + ASSERT0(bp->blk_fill); + ASSERT0(bp->blk_pad[0]); + ASSERT0(bp->blk_pad[1]); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(BP_IS_HOLE(bp)); + ASSERT0(bp->blk_phys_birth); + } } } } + DB_DNODE_EXIT(db); } #endif static void -dbuf_update_data(dmu_buf_impl_t *db) +dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level == 0 && db->db_user_data_ptr_ptr) { - ASSERT(!refcount_is_zero(&db->db_holds)); - *db->db_user_data_ptr_ptr = db->db.db_data; - } + dbuf_evict_user(db); + ASSERT3P(db->db_buf, ==, NULL); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; } static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); + ASSERT(buf != NULL); + db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - dbuf_update_data(db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; - } + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; } /* @@ -414,26 +850,54 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; + spa_t *spa = db->db_objset->os_spa; + mutex_exit(&db->db_mtx); - abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); + abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); 
- dbuf_set_data(db, NULL); + db->db_buf = NULL; + dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } +/* + * Calculate which level n block references the data at the level 0 offset + * provided. + */ uint64_t -dbuf_whichblock(dnode_t *dn, uint64_t offset) +dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) { - if (dn->dn_datablkshift) { - return (offset >> dn->dn_datablkshift); + if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { + /* + * The level n blkid is equal to the level 0 blkid divided by + * the number of level 0s in a level n block. + * + * The level 0 blkid is offset >> datablkshift = + * offset / 2^datablkshift. + * + * The number of level 0s in a level n is the number of block + * pointers in an indirect block, raised to the power of level. + * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = + * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). + * + * Thus, the level n blkid is: offset / + * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) + * = offset / 2^(datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + * = offset >> (datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + */ + return (offset >> (dn->dn_datablkshift + level * + (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); @@ -465,24 +929,24 @@ dbuf_read_done(zio_t *zio, arc_buf_t *bu dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + arc_buf_destroy(buf, db); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); - mutex_exit(&db->db_mtx); - dbuf_rele(db, NULL); + dbuf_rele_and_unlock(db, NULL); } static void -dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { - dnode_t *dn = db->db_dnode; - zbookmark_t zb; - uint32_t aflags = ARC_NOWAIT; - arc_buf_t *pbuf; + dnode_t *dn; + zbookmark_phys_t zb; + arc_flags_t aflags = ARC_FLAG_NOWAIT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -490,7 +954,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); @@ -500,7 +964,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); - dbuf_update_data(db); + DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; @@ -516,47 +980,63 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, + dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); + + if (db->db_blkptr != NULL && db->db_level > 0 && + BP_IS_HOLE(db->db_blkptr) && + db->db_blkptr->blk_birth != 0) { + blkptr_t *bps = db->db.db_data; + for (int i = 0; i < ((1 << + DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t)); + i++) { + blkptr_t *bp = &bps[i]; + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, + 1 << dn->dn_indblkshift); + BP_SET_LSIZE(bp, + BP_GET_LEVEL(db->db_blkptr) == 1 ? + dn->dn_datablksz : + BP_GET_LSIZE(db->db_blkptr)); + BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); + BP_SET_LEVEL(bp, + BP_GET_LEVEL(db->db_blkptr) - 1); + BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + } + } + DB_DNODE_EXIT(db); db->db_state = DB_CACHED; - *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); return; } + DB_DNODE_EXIT(db); + db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); - /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - - if (db->db_parent) - pbuf = db->db_parent->db_buf; - else - pbuf = db->db_objset->os_phys_buf; - (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, + (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + (flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); - if (aflags & ARC_CACHED) - *flags |= DB_RF_CACHED; } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; - int havepzio = (zio != NULL); - int prefetch; + boolean_t havepzio = (zio != NULL); + boolean_t prefetch; + dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it @@ -565,59 +1045,72 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) - return (EIO); + return (SET_ERROR(EIO)); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); + rw_enter(&dn->dn_struct_rwlock, RW_READER); - prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, - db->db.db_size, TRUE); + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { - if (zio == NULL) { - zio = zio_root(db->db_dnode->dn_objset->os_spa, - NULL, NULL, ZIO_FLAG_CANFAIL); - } - dbuf_read_impl(db, zio, &flags); + spa_t *spa = dn->dn_objset->os_spa; + + if (zio == NULL) + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + dbuf_read_impl(db, zio, flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, - db->db.db_size, flags & DB_RF_CACHED); + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { + /* + * Another reader came in while the dbuf was in flight + * between UNCACHED and CACHED. Either a writer will finish + * writing the buffer (sending the dbuf to CACHED) or the + * first reader's request will reach the read_done callback + * and send the dbuf to CACHED. Otherwise, a failure + * occurred and the dbuf went to UNCACHED. + */ mutex_exit(&db->db_mtx); if (prefetch) - dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, - db->db.db_size, TRUE); + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + /* Skip the wait per the caller's request. 
*/ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); + DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, + db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) - err = EIO; + err = SET_ERROR(EIO); } mutex_exit(&db->db_mtx); } @@ -630,20 +1123,20 @@ static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, - db->db.db_size, db, type)); + dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } @@ -675,18 +1168,18 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, ui if (dr == NULL || (dr->dt.dl.dr_data != - ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: - * reset the reference to point to a new copy, + * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); @@ -694,11 +1187,13 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, ui } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dr->dt.dl.dr_data = arc_buf_alloc( - db->db_dnode->dn_objset->os_spa, size, db, type); + spa_t *spa = db->db_objset->os_spa; + + dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { - dbuf_set_data(db, NULL); + db->db_buf = NULL; + dbuf_clear_data(db); } } @@ -713,17 +1208,19 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); - if (db->db_blkid == DB_BONUS_BLKID || + if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ - if (!BP_IS_HOLE(bp)) - dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp); + if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) + zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + dr->dt.dl.dr_nopwrite = B_FALSE; + /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are @@ -738,54 +1235,48 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find - * empty blocks. 
Also, if we happen accross any level-1 dbufs in the - * range that have not already been marked dirty, mark them dirty so - * they stay in memory. + * empty blocks. */ void -dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) { + dmu_buf_impl_t db_search; dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - uint64_t first_l1 = start >> epbs; - uint64_t last_l1 = end >> epbs; - - if (end > dn->dn_maxblkid) { - end = dn->dn_maxblkid; - last_l1 = end >> epbs; - } - dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + avl_index_t where; + + if (end_blkid > dn->dn_maxblkid && + !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) + end_blkid = dn->dn_maxblkid; + dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); + + db_search.db_level = 0; + db_search.db_blkid = start_blkid; + db_search.db_state = DB_SEARCH; + mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + db = avl_find(&dn->dn_dbufs, &db_search, &where); + ASSERT3P(db, ==, NULL); - if (db->db_level == 1 && - db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { - mutex_enter(&db->db_mtx); - if (db->db_last_dirty && - db->db_last_dirty->dr_txg < txg) { - dbuf_add_ref(db, FTAG); - mutex_exit(&db->db_mtx); - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } else { - mutex_exit(&db->db_mtx); - } - } + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); - if (db->db_level != 0) - continue; - dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < start || db->db_blkid > end) - continue; + for (; db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + if (db->db_level != 0 || db->db_blkid > end_blkid) { + break; + } + ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ - if (dbuf_undirty(db, tx)) + mutex_enter(&db->db_mtx); + if (dbuf_undirty(db, tx)) { + /* mutex has been dropped and dbuf destroyed */ continue; + } - mutex_enter(&db->db_mtx); if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { @@ -801,7 +1292,7 @@ dbuf_free_range(dnode_t *dn, uint64_t st } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); - dbuf_clear(db); + dbuf_destroy(db); continue; } /* The dbuf is referenced */ @@ -815,7 +1306,8 @@ dbuf_free_range(dnode_t *dn, uint64_t st * size to reflect that this buffer may * contain new data when we sync. */ - if (db->db_blkid > dn->dn_maxblkid) + if (db->db_blkid != DMU_SPILL_BLKID && + db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { @@ -851,19 +1343,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db) * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. + * + * This logic ensures that only block births for + * filled blocks are considered. 
*/ ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_last_dirty) + if (db->db_last_dirty && (db->db_blkptr == NULL || + !BP_IS_HOLE(db->db_blkptr))) { birth_txg = db->db_last_dirty->dr_txg; - else if (db->db_blkptr) + } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { birth_txg = db->db_blkptr->blk_birth; + } - /* If we don't exist or are in a snapshot, we can't be freed */ - if (birth_txg) + /* + * If this block don't exist or is in a snapshot, it can't be freed. + * Don't pass the bp to dsl_dataset_block_freeable() since we + * are holding the db_mtx lock and might deadlock if we are + * prefetching a dedup-ed block. + */ + if (birth_txg != 0) return (ds == NULL || - dsl_dataset_block_freeable(ds, birth_txg)); + dsl_dataset_block_freeable(ds, NULL, birth_txg)); else - return (FALSE); + return (B_FALSE); } void @@ -872,14 +1374,18 @@ dbuf_new_size(dmu_buf_impl_t *db, int si arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dnode_t *dn; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ - ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* - * This call to dbuf_will_dirty() with the dn_struct_rwlock held + * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. @@ -888,10 +1394,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int si * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ - buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); + buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; @@ -902,7 +1408,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int si mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); - VERIFY(arc_buf_remove_ref(obuf, db) == 1); + arc_buf_destroy(obuf, db); db->db.db_size = size; if (db->db_level == 0) { @@ -911,14 +1417,54 @@ dbuf_new_size(dmu_buf_impl_t *db, int si } mutex_exit(&db->db_mtx); - dnode_willuse_space(db->db_dnode, size-osize, tx); + dnode_willuse_space(dn, size-osize, tx); + DB_DNODE_EXIT(db); +} + +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os = db->db_objset; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + (void) arc_release(db->db_buf, db); +} + +/* + * We already have a dirty record for this TXG, and we are being + * dirtied again. + */ +static void +dbuf_redirty(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(dr); + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) { + /* Already released on initial dirty, so just thaw. 
*/ + ASSERT(arc_released(db->db_buf)); + arc_buf_thaw(db->db_buf); + } + } } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; @@ -928,15 +1474,25 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ +#ifdef DEBUG + if (dn->dn_objset->os_dsl_dataset != NULL) { + rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, + RW_READER, FTAG); + } ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); +#endif /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty @@ -961,15 +1517,27 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ - if (dn->dn_dirtyctx == DN_UNDIRTIED && - !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { - dn->dn_dirtyctx = - (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); - ASSERT(dn->dn_dirtyctx_firstset == NULL); - dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + if (dn->dn_dirtyctx == DN_UNDIRTIED) { + if (dn->dn_objset->os_dsl_dataset != NULL) { + rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, + RW_READER, FTAG); + } + if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? + DN_DIRTY_SYNC : DN_DIRTY_OPEN); + ASSERT(dn->dn_dirtyctx_firstset == NULL); + dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); + } + if (dn->dn_objset->os_dsl_dataset != NULL) { + rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, + FTAG); + } } mutex_exit(&dn->dn_mtx); + if (db->db_blkid == DMU_SPILL_BLKID) + dn->dn_have_spill = B_TRUE; + /* * If this buffer is already dirty, we're done. */ @@ -979,16 +1547,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { - /* - * If this buffer has already been written out, - * we now need to reset its state. - */ - dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) - arc_buf_thaw(db->db_buf); - } + DB_DNODE_EXIT(db); + + dbuf_redirty(dr); mutex_exit(&db->db_mtx); return (dr); } @@ -1014,13 +1575,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. 
*/ + os = dn->dn_objset; +#ifdef DEBUG + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); +#endif ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); - if (db->db_blkid != DB_BONUS_BLKID) { + if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop @@ -1042,7 +1610,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { @@ -1068,6 +1636,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } + if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) + dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; @@ -1078,9 +1648,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); + if (dn->dn_free_ranges[txgoff] != NULL) { + range_tree_clear(dn->dn_free_ranges[txgoff], + db->db_blkid, 1); + } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } @@ -1094,14 +1668,29 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t mutex_exit(&db->db_mtx); - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); - } else if (do_free_accounting) { + } + + /* + * The dn_struct_rwlock prevents db_blkptr from changing + * due to a write from syncing context completing + * while we are running, so we want to acquire it before + * looking at db_blkptr. + */ + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? bp_get_dsize(os->os_spa, bp) : db->db.db_size; @@ -1113,14 +1702,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. 
*/ + ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } - if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { - rw_enter(&dn->dn_struct_rwlock, RW_READER); - drop_struct_lock = TRUE; - } - if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); @@ -1136,6 +1721,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); + ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) @@ -1146,7 +1732,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); - /* possible race with dbuf_undirty() */ + /* + * Since we've dropped the mutex, it's possible that + * dbuf_undirty() might have changed this out from under us. + */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); @@ -1160,8 +1749,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); - ASSERT(db->db_parent == NULL || - db->db_parent == db->db_dnode->dn_dbuf); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); @@ -1171,119 +1759,138 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t } dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); return (dr); } -static int +/* + * Undirty a buffer in the transaction group referenced by the given + * transaction. Return whether this evicted the dbuf. + */ +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); - ASSERT(db->db_blkid != DB_BONUS_BLKID); - mutex_enter(&db->db_mtx); + /* + * Due to our use of dn_nlevels below, this can only be called + * in open context, unless we are operating on the MOS. + * From syncing context, dn_nlevels may be different from the + * dn_nlevels used when dbuf was dirtied. + */ + ASSERT(db->db_objset == + dmu_objset_pool(db->db_objset)->dp_meta_objset || + txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT0(db->db_level); + ASSERT(MUTEX_HELD(&db->db_mtx)); + /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; - if (dr == NULL || dr->dr_txg < txg) { - mutex_exit(&db->db_mtx); - return (0); - } + if (dr == NULL || dr->dr_txg < txg) + return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); - /* - * If this buffer is currently held, we cannot undirty - * it, since one of the current holders may be in the - * middle of an update. Note that users of dbuf_undirty() - * should not place a hold on the dbuf before the call. 
- */ - if (refcount_count(&db->db_holds) > db->db_dirtycnt) { - mutex_exit(&db->db_mtx); - /* Make sure we don't toss this buffer at sync phase */ - mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, db->db_blkid, 1, tx); - mutex_exit(&dn->dn_mtx); - return (0); - } + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); - /* XXX would be nice to fix up dn_towrite_space[] */ + dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), + dr->dr_accounted, txg); *drp = dr->dr_next; + /* + * Note that there are three places in dbuf_dirty() + * where this dirty record may be put on a list. + * Make sure to do a list_remove corresponding to + * every one of those list_insert calls. + */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); - } else if (db->db_level+1 == dn->dn_nlevels) { + } else if (db->db_blkid == DMU_SPILL_BLKID || + db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } + DB_DNODE_EXIT(db); - if (db->db_level == 0) { - if (db->db_state != DB_NOFILL) { - dbuf_unoverride(dr); + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); - } - } else { ASSERT(db->db_buf != NULL); - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + arc_buf_destroy(dr->dt.dl.dr_data, db); } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { - arc_buf_t *buf = db->db_buf; - - ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); - return (1); + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); + return (B_TRUE); } - mutex_exit(&db->db_mtx); - return (0); + return (B_FALSE); } -__attribute__((__weak__)) void +void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dbuf_will_dirty(db, tx); -} - -void -dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); - if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) + /* + * Quick check for dirtyness. For already dirty blocks, this + * reduces runtime of this function by >90%, and overall performance + * by 50% for some workloads (e.g. file deletion with indirect blocks + * cached). + */ + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr; + for (dr = db->db_last_dirty; + dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { + /* + * It's possible that it is already dirty but not cached, + * because there are some calls to dbuf_dirty() that don't + * go through dmu_buf_will_dirty(). + */ + if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { + /* This dbuf is already dirty and cached. 
*/ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } + } + mutex_exit(&db->db_mtx); + + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } @@ -1303,7 +1910,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dm { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1315,13 +1922,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dm (void) dbuf_dirty(db, tx); } -__attribute__((__weak__)) void -dmu_buf_fill_done(dmu_buf_t *db_fake, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dbuf_fill_done(db, tx); -} - +#pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -1331,7 +1932,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_t if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); @@ -1343,6 +1944,43 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_t mutex_exit(&db->db_mtx); } +void +dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + struct dirty_leaf *dl; + dmu_object_type_t type; + + if (etype == BP_EMBEDDED_TYPE_DATA) { + ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), + SPA_FEATURE_EMBEDDED_DATA)); + } + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + dmu_buf_will_not_fill(dbuf, tx); + + ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); + dl = &db->db_last_dirty->dt.dl; + encode_embedded_bp_compressed(&dl->dr_overridden_by, + data, comp, uncompressed_size, compressed_size); + BPE_SET_ETYPE(&dl->dr_overridden_by, etype); + BP_SET_TYPE(&dl->dr_overridden_by, type); + BP_SET_LEVEL(&dl->dr_overridden_by, 0); + BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); + + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; +} + /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
@@ -1351,8 +1989,7 @@ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); - ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); ASSERT(buf != NULL); @@ -1374,7 +2011,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); - VERIFY(arc_buf_remove_ref(buf, db) == 1); + arc_buf_destroy(buf, db); xuio_stat_wbuf_copied(); return; } @@ -1392,10 +2029,10 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + arc_buf_destroy(db->db_buf, db); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; } @@ -1404,71 +2041,109 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dbuf_fill_done(db, tx); + dmu_buf_fill_done(&db->db, tx); } -/* - * "Clear" the contents of this dbuf. This will mark the dbuf - * EVICTING and clear *most* of its references. Unfortunetely, - * when we are not holding the dn_dbufs_mtx, we can't clear the - * entry in the dn_dbufs list. We have to wait until dbuf_destroy() - * in this case. For callers from the DMU we will usually see: - * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() - * For the arc callback, we will usually see: - * dbuf_do_evict()->dbuf_clear();dbuf_destroy() - * Sometimes, though, we will get a mix of these two: - * DMU: dbuf_clear()->arc_buf_evict() - * ARC: dbuf_do_evict()->dbuf_destroy() - */ void -dbuf_clear(dmu_buf_impl_t *db) +dbuf_destroy(dmu_buf_impl_t *db) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; - dmu_buf_impl_t *dndb = dn->dn_dbuf; - int dbuf_gone = FALSE; + dmu_buf_impl_t *dndb; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); - dbuf_evict_user(db); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } - if (db->db_state == DB_CACHED) { + if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - } - db->db.db_data = NULL; + zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); db->db_state = DB_UNCACHED; } + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; - if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { - list_remove(&dn->dn_dbufs, db); + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. 
+ */ + mutex_exit(&db->db_mtx); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter(&dn->dn_dbufs_mtx); + avl_remove(&dn->dn_dbufs, db); + atomic_dec_32(&dn->dn_dbufs_count); + membar_producer(); + DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. + * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. + */ dnode_rele(dn, db); - db->db_dnode = NULL; + db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); + } else { + DB_DNODE_EXIT(db); } - if (db->db_buf) - dbuf_gone = arc_buf_evict(db->db_buf); + ASSERT(refcount_is_zero(&db->db_holds)); - if (!dbuf_gone) - mutex_exit(&db->db_mtx); + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); /* - * If this dbuf is referened from an indirect dbuf, + * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) dbuf_rele(parent, db); } +/* + * Note: While bpp will always be updated if the function returns success, + * parentp will not be updated if the dnode does not have dn_dbuf filled in; + * this happens when the dnode is the meta-dnode, or a userused or groupused + * object. 
+ */ static int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) @@ -1478,7 +2153,20 @@ dbuf_findbp(dnode_t *dn, int level, uint *parentp = NULL; *bpp = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); + + if (blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_have_spill && + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + *bpp = &dn->dn_phys->dn_spill; + else + *bpp = NULL; + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + mutex_exit(&dn->dn_mtx); + return (0); + } if (dn->dn_phys->dn_nlevels == 0) nlevels = 1; @@ -1492,11 +2180,11 @@ dbuf_findbp(dnode_t *dn, int level, uint if (level >= nlevels || (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ - return (ENOENT); + return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, NULL, parentp); + blkid >> epbs, fail_sparse, FALSE, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, @@ -1533,7 +2221,7 @@ dbuf_create(dnode_t *dn, uint8_t level, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); - db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; @@ -1541,29 +2229,32 @@ dbuf_create(dnode_t *dn, uint8_t level, db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; - db->db_dnode = dn; + db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user_ptr = NULL; - db->db_user_data_ptr_ptr = NULL; - db->db_evict_func = NULL; - db->db_immediate_evict = 0; - db->db_freed_in_flight = 0; + db->db_user = NULL; + db->db_user_immediate_evict = FALSE; + db->db_freed_in_flight = FALSE; + db->db_pending_evict = FALSE; - if (blkid == DB_BONUS_BLKID) { + if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); - db->db.db_offset = DB_BONUS_BLKID; + db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); + } else if (blkid == DMU_SPILL_BLKID) { + db->db.db_size = (blkptr != NULL) ? + BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; + db->db.db_offset = 0; } else { int blocksize = - db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; + db->db_level ?
1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } @@ -1579,11 +2270,12 @@ dbuf_create(dnode_t *dn, uint8_t level, db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ - kmem_cache_free(dbuf_cache, db); + kmem_cache_free(dbuf_kmem_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } - list_insert_head(&dn->dn_dbufs, db); + avl_add(&dn->dn_dbufs, db); + db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); @@ -1594,120 +2286,241 @@ dbuf_create(dnode_t *dn, uint8_t level, ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); + atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } -static int -dbuf_do_evict(void *private) -{ - arc_buf_t *buf = private; - dmu_buf_impl_t *db = buf->b_private; +typedef struct dbuf_prefetch_arg { + spa_t *dpa_spa; /* The spa to issue the prefetch in. */ + zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ + int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ + int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ + zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ + zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ + arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ +} dbuf_prefetch_arg_t; - if (!MUTEX_HELD(&db->db_mtx)) - mutex_enter(&db->db_mtx); +/* + * Actually issue the prefetch read for the block given. + */ +static void +dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return; - ASSERT(refcount_is_zero(&db->db_holds)); + arc_flags_t aflags = + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - if (db->db_state != DB_EVICTING) { - ASSERT(db->db_state == DB_CACHED); - DBUF_VERIFY(db); - db->db_buf = NULL; - dbuf_evict(db); - } else { - mutex_exit(&db->db_mtx); - dbuf_destroy(db); - } - return (0); + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); + ASSERT(dpa->dpa_zio != NULL); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &dpa->dpa_zb); } +/* + * Called when an indirect block above our prefetch target is read in. This + * will either read in the next indirect block down the tree or issue the actual + * prefetch if the next block down is our target. + */ static void -dbuf_destroy(dmu_buf_impl_t *db) +dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) { - ASSERT(refcount_is_zero(&db->db_holds)); + dbuf_prefetch_arg_t *dpa = private; - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * If this dbuf is still on the dn_dbufs list, - * remove it from that list. - */ - if (db->db_dnode) { - dnode_t *dn = db->db_dnode; + ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); + ASSERT3S(dpa->dpa_curlevel, >, 0); - mutex_enter(&dn->dn_dbufs_mtx); - list_remove(&dn->dn_dbufs, db); - mutex_exit(&dn->dn_dbufs_mtx); - - dnode_rele(dn, db); - db->db_dnode = NULL; + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. 
Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ + if (zio != NULL) { + ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); + if (zio->io_flags & ZIO_FLAG_RAW) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } - dbuf_hash_remove(db); - } - db->db_parent = NULL; - db->db_buf = NULL; + ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); - ASSERT(!list_link_active(&db->db_link)); - ASSERT(db->db.db_data == NULL); - ASSERT(db->db_hash_next == NULL); - ASSERT(db->db_blkptr == NULL); - ASSERT(db->db_data_pending == NULL); + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); + } + + dpa->dpa_curlevel--; + + uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); + blkptr_t *bp = ((blkptr_t *)abuf->b_data) + + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { + kmem_free(dpa, sizeof (*dpa)); + } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { + ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); + dbuf_issue_final_prefetch(dpa, bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + + SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, + dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); + + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } - kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); + arc_buf_destroy(abuf, private); } +/* + * Issue prefetch reads for the given block on the given level. If the indirect + * blocks above that block are not in memory, we will read them in + * asynchronously. As a result, this call never blocks waiting for a read to + * complete. + */ void -dbuf_prefetch(dnode_t *dn, uint64_t blkid) +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) { - dmu_buf_impl_t *db = NULL; - blkptr_t *bp = NULL; + blkptr_t bp; + int epbs, nlevels, curlevel; + uint64_t curblkid; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + if (blkid > dn->dn_maxblkid) + return; + if (dnode_block_freed(dn, blkid)) return; - /* dbuf_find() returns with db_mtx held */ - if (db = dbuf_find(dn, 0, blkid)) { - if (refcount_count(&db->db_holds) > 0) { - /* - * This dbuf is active. We assume that it is - * already CACHED, or else about to be either - * read or filled. - */ - mutex_exit(&db->db_mtx); - return; - } + /* + * This dnode hasn't been written to disk yet, so there's nothing to + * prefetch. 
+ */ + nlevels = dn->dn_phys->dn_nlevels; + if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) + return; + + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) + return; + + dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, + level, blkid); + if (db != NULL) { mutex_exit(&db->db_mtx); - db = NULL; + /* + * This dbuf already exists. It is either CACHED, or + * (we assume) about to be read or filled. + */ + return; + } + + /* + * Find the closest ancestor (indirect block) of the target block + * that is present in the cache. In this indirect block, we will + * find the bp that is at curlevel, curblkid. + */ + curlevel = level; + curblkid = blkid; + while (curlevel < nlevels - 1) { + int parent_level = curlevel + 1; + uint64_t parent_blkid = curblkid >> epbs; + dmu_buf_impl_t *db; + + if (dbuf_hold_impl(dn, parent_level, parent_blkid, + FALSE, TRUE, FTAG, &db) == 0) { + blkptr_t *bpp = db->db_buf->b_data; + bp = bpp[P2PHASE(curblkid, 1 << epbs)]; + dbuf_rele(db, FTAG); + break; + } + + curlevel = parent_level; + curblkid = parent_blkid; } - if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { - if (bp && !BP_IS_HOLE(bp)) { - arc_buf_t *pbuf; - dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; + if (curlevel == nlevels - 1) { + /* No cached indirect blocks found. */ + ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); + bp = dn->dn_phys->dn_blkptr[curblkid]; + } + if (BP_IS_HOLE(&bp)) + return; - SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, - dn->dn_object, 0, blkid); + ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); - if (db) - pbuf = db->db_buf; - else - pbuf = dn->dn_objset->os_phys_buf; + zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, + ZIO_FLAG_CANFAIL); - (void) arc_read(NULL, dn->dn_objset->os_spa, - bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - if (db) - dbuf_rele(db, NULL); + dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, level, blkid); + dpa->dpa_curlevel = curlevel; + dpa->dpa_prio = prio; + dpa->dpa_aflags = aflags; + dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_dnode = dn; + dpa->dpa_epbs = epbs; + dpa->dpa_zio = pio; + + /* + * If we have the indirect just above us, no need to do the asynchronous + * prefetch chain; we'll just run the last step ourselves. If we're at + * a higher level, though, we want to issue the prefetches for all the + * indirect blocks asynchronously, so we can go on with whatever we were + * doing. + */ + if (curlevel == level) { + ASSERT3U(curblkid, ==, blkid); + dbuf_issue_final_prefetch(dpa, &bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, curlevel, curblkid); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + &bp, dbuf_prefetch_indirect_done, dpa, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); } + /* + * We use pio here instead of dpa_zio since it's possible that + * dpa may have already been freed. + */ + zio_nowait(pio); } /* @@ -1715,29 +2528,33 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki * Note: dn_struct_rwlock must be held. 
*/ int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; top: /* dbuf_find() returns with db_mtx held */ - db = dbuf_find(dn, level, blkid); + db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db == NULL) { blkptr_t *bp = NULL; int err; + if (fail_uncached) + return (SET_ERROR(ENOENT)); + ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) - err = ENOENT; + err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); @@ -1749,19 +2566,14 @@ top: db = dbuf_create(dn, level, blkid, parent, bp); } - if (db->db_buf && refcount_is_zero(&db->db_holds)) { - arc_buf_add_ref(db->db_buf, db); - if (db->db_buf->b_data == NULL) { - dbuf_clear(db); - if (parent) { - dbuf_rele(parent, NULL); - parent = NULL; - } - goto top; - } - ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + if (fail_uncached && db->db_state != DB_CACHED) { + mutex_exit(&db->db_mtx); + return (SET_ERROR(ENOENT)); } + if (db->db_buf != NULL) + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* @@ -1769,7 +2581,7 @@ top: * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ - if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; @@ -1778,15 +2590,20 @@ top: arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, - arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + arc_alloc_buf(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(refcount_is_zero(&db->db_holds)); + multilist_remove(&dbuf_cache, db); + (void) refcount_remove_many(&dbuf_cache_size, + db->db.db_size, db); + } (void) refcount_add(&db->db_holds, tag); - dbuf_update_data(db); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -1794,7 +2611,7 @@ top: if (parent) dbuf_rele(parent, NULL); - ASSERT3P(db->db_dnode, ==, dn); + ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; @@ -1805,16 +2622,14 @@ top: dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { - dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); - return (err ? NULL : db); + return (dbuf_hold_level(dn, 0, blkid, tag)); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; - int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); + int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); return (err ? 
NULL : db); } @@ -1824,30 +2639,77 @@ dbuf_create_bonus(dnode_t *dn) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } -__attribute__((__weak__)) void -dmu_buf_add_ref(dmu_buf_t *db_fake, void *tag) +int +dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dbuf_add_ref(db, tag); + dnode_t *dn; + + if (db->db_blkid != DMU_SPILL_BLKID) + return (SET_ERROR(ENOTSUP)); + if (blksz == 0) + blksz = SPA_MINBLOCKSIZE; + ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dbuf_new_size(db, blksz, tx); + rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(db); + + return (0); } void +dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); +} + +#pragma weak dmu_buf_add_ref = dbuf_add_ref +void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); - ASSERT(holds > 1); + ASSERT3S(holds, >, 1); } -__attribute__((__weak__)) void -dmu_buf_rele(dmu_buf_t *db_fake, void *tag) +#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref +boolean_t +dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, + void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dbuf_rele(db, tag); + dmu_buf_impl_t *found_db; + boolean_t result = B_FALSE; + + if (db->db_blkid == DMU_BONUS_BLKID) + found_db = dbuf_find_bonus(os, obj); + else + found_db = dbuf_find(os, obj, 0, blkid); + + if (found_db != NULL) { + if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { + (void) refcount_add(&db->db_holds, tag); + result = B_TRUE; + } + mutex_exit(&db->db_mtx); + } + return (result); } +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) + * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { @@ -1855,6 +2717,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_rele_and_unlock(db, tag); } +void +dmu_buf_rele(dmu_buf_t *db, void *tag) +{ + dbuf_rele((dmu_buf_impl_t *)db, tag); +} + /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. @@ -1867,6 +2735,11 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); @@ -1874,17 +2747,47 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ - if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) { arc_buf_freeze(db->db_buf); + } if (holds == db->db_dirtycnt && - db->db_level == 0 && db->db_immediate_evict) + db->db_level == 0 && db->db_user_immediate_evict) dbuf_evict_user(db); if (holds == 0) { - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dnode_t *dn; + boolean_t evict_dbuf = db->db_pending_evict; + + /* + * If the dnode moves here, we cannot cross this + * barrier until the move completes. + */ + DB_DNODE_ENTER(db); + + dn = DB_DNODE(db); + atomic_dec_32(&dn->dn_dbufs_count); + + /* + * Decrementing the dbuf count means that the bonus + * buffer's dnode hold is no longer discounted in + * dnode_move(). The dnode cannot move until after + * the dnode_rele() below. + */ + DB_DNODE_EXIT(db); + + /* + * Do not reference db after its lock is dropped. + * Another thread may evict it. + */ mutex_exit(&db->db_mtx); - dnode_rele(db->db_dnode, db); + + if (evict_dbuf) + dnode_evict_bonus(dn); + + dnode_rele(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this @@ -1892,34 +2795,47 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); - dbuf_evict(db); + dbuf_destroy(db); } else if (arc_released(db->db_buf)) { - arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_set_data(db, NULL); - VERIFY(arc_buf_remove_ref(buf, db) == 1); - dbuf_evict(db); + dbuf_destroy(db); } else { - VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - if (!DBUF_IS_CACHEABLE(db)) - dbuf_clear(db); - else + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); + + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; + } + + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + multilist_insert(&dbuf_cache, db); + (void) refcount_add_many(&dbuf_cache_size, + db->db.db_size, db); mutex_exit(&db->db_mtx); + + dbuf_evict_notify(); + } + + if (do_arc_evict) + arc_freed(spa, &bp); } } else { mutex_exit(&db->db_mtx); } -} -__attribute__((__weak__)) uint64_t -dmu_buf_refcount(dmu_buf_t *db_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - return dbuf_refcount(db); } +#pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { @@ -1927,56 +2843,57 @@ dbuf_refcount(dmu_buf_impl_t *db) } void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) { - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + if (db->db_user == old_user) + db->db_user = new_user; + else + old_user = db->db_user; + dbuf_verify_user(db, DBVU_NOT_EVICTING); + mutex_exit(&db->db_mtx); + + return (old_user); } void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, - user_data_ptr_ptr, evict_func)); + 
return (dmu_buf_replace_user(db_fake, NULL, user)); } void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - - ASSERT((user_ptr == NULL) == (evict_func == NULL)); - mutex_enter(&db->db_mtx); - - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_user_data_ptr_ptr = user_data_ptr_ptr; - db->db_evict_func = evict_func; - - dbuf_update_data(db); - } else { - old_user_ptr = db->db_user_ptr; - } + db->db_user_immediate_evict = TRUE; + return (dmu_buf_set_user(db_fake, user)); +} - mutex_exit(&db->db_mtx); - return (old_user_ptr); +void * +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - return (db->db_user_ptr); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + return (db->db_user); +} + +void +dmu_buf_user_evict_wait() +{ + taskq_wait(dbu_evict_taskq); } boolean_t @@ -1987,11 +2904,40 @@ dmu_buf_freeable(dmu_buf_t *dbuf) if (db->db_blkptr) res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, - db->db_blkptr->blk_birth); + db->db_blkptr, db->db_blkptr->blk_birth); return (res); } +blkptr_t * +dmu_buf_get_blkptr(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_blkptr); +} + +objset_t * +dmu_buf_get_objset(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_objset); +} + +dnode_t * +dmu_buf_dnode_enter(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_ENTER(dbi); + return (DB_DNODE(dbi)); +} + +void +dmu_buf_dnode_exit(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_EXIT(dbi); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -2001,6 +2947,11 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_i if (db->db_blkptr != NULL) return; + if (db->db_blkid == DMU_SPILL_BLKID) { + db->db_blkptr = &dn->dn_phys->dn_spill; + BP_ZERO(db->db_blkptr); + return; + } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was @@ -2020,8 +2971,8 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_i if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); - (void) dbuf_hold_impl(dn, db->db_level+1, - db->db_blkid >> epbs, FALSE, db, &parent); + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, db); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; @@ -2036,7 +2987,7 @@ static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; + dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); @@ -2048,17 +2999,23 @@ dbuf_sync_indirect(dbuf_dirty_record_t * ASSERT(db->db_level > 0); DBUF_VERIFY(db); + /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); ASSERT(db->db_buf != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* Indirect block size must match what the dnode thinks it is.
*/ + ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); + DB_DNODE_EXIT(db); + /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); @@ -2066,7 +3023,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t * zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); - dbuf_sync_list(&dr->dt.di.dr_children, tx); + dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); @@ -2077,8 +3034,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); @@ -2101,19 +3058,30 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, } DBUF_VERIFY(db); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ - if (db->db_blkid == DB_BONUS_BLKID) { + if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); - ASSERT3U(db->db_level, ==, 0); + ASSERT0(db->db_level); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); + DB_DNODE_EXIT(db); + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); @@ -2125,6 +3093,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; + if (dr->dr_dbuf->db_level != 0) { + list_destroy(&dr->dt.di.dr_children); + mutex_destroy(&dr->dt.di.dr_mtx); + } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; @@ -2132,6 +3104,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, return; } + os = dn->dn_objset; + /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we @@ -2141,7 +3115,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dbuf_check_blkptr(dn, db); /* - * If this buffer is in the middle of an immdiate write, + * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { @@ -2168,7 +3142,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + *datap = arc_alloc_buf(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -2178,14 +3152,24 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); - else + DB_DNODE_EXIT(db); + } else { + /* + * Although zio_nowait() does not "wait for an IO", it does + * initiate the IO. If this is an empty write it seems plausible + * that the IO could actually be completed before the nowait + * returns.
We need to DB_DNODE_EXIT() first in case + * zio_nowait() invalidates the dbuf. + */ + DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); + } } void -dbuf_sync_list(list_t *list, dmu_tx_t *tx) +dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; @@ -2202,6 +3186,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *t DMU_META_DNODE_OBJECT); break; } + if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + VERIFY3U(dr->dr_dbuf->db_level, ==, level); + } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); @@ -2215,33 +3203,46 @@ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - dnode_t *dn = db->db_dnode; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; - ASSERT(db->db_blkptr == bp); + ASSERT3P(db->db_blkptr, !=, NULL); + ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (BP_IS_HOLE(bp)) { - ASSERT(bp->blk_fill == 0); - return; + if (bp->blk_birth != 0) { + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype) || + BP_IS_EMBEDDED(bp)); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); } - ASSERT(BP_GET_TYPE(bp) == dn->dn_type); - ASSERT(BP_GET_LEVEL(bp) == db->db_level); - mutex_enter(&db->db_mtx); +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(bp)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + } +#endif + if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); - if (db->db_blkid > dn->dn_phys->dn_maxblkid) + if (db->db_blkid > dn->dn_phys->dn_maxblkid && + db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); @@ -2253,7 +3254,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t * fill++; } } else { - fill = 1; + if (BP_IS_HOLE(bp)) { + fill = 0; + } else { + fill = 1; + } } } else { blkptr_t *ibp = db->db.db_data; @@ -2261,13 +3266,90 @@ dbuf_write_ready(zio_t *zio, arc_buf_t * for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; - fill += ibp->blk_fill; + fill += BP_GET_FILL(ibp); } } + DB_DNODE_EXIT(db); - bp->blk_fill = fill; + if (!BP_IS_EMBEDDED(bp)) + bp->blk_fill = fill; mutex_exit(&db->db_mtx); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + *db->db_blkptr = *bp; + rw_exit(&dn->dn_struct_rwlock); +} + +/* ARGSUSED */ +/* + * This function gets called just prior to running through the compression + * stage of the zio pipeline. If we're an indirect block comprised of only + * holes, then we want this indirect to be compressed away to a hole. In + * order to do that we must zero out any information about the holes that + * this indirect points to prior to before we try to compress it. 
+ */ +static void +dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + dnode_t *dn; + blkptr_t *bp; + uint64_t i; + int epbs; + + ASSERT3U(db->db_level, >, 0); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + /* Determine if all our children are holes */ + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + if (!BP_IS_HOLE(bp)) + break; + } + + /* + * If all the children are holes, then zero them all out so that + * we may get compressed away. + */ + if (i == 1 << epbs) { + /* didn't find any non-holes */ + bzero(db->db.db_data, db->db.db_size); + } + DB_DNODE_EXIT(db); +} + +/* + * The SPA will call this callback several times for each zio - once + * for every physical child i/o (zio->io_phys_children times). This + * allows the DMU to monitor the progress of each logical i/o. For example, + * there may be 2 copies of an indirect block, or many fragments of a RAID-Z + * block. There may be a long delay before all copies/fragments are completed, + * so this callback allows us to retire dirty space gradually, as the physical + * i/os complete. + */ +/* ARGSUSED */ +static void +dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) +{ + dmu_buf_impl_t *db = arg; + objset_t *os = db->db_objset; + dsl_pool_t *dp = dmu_objset_pool(os); + dbuf_dirty_record_t *dr; + int delta = 0; + + dr = db->db_data_pending; + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dsl_pool_sync()'s call to + * dsl_pool_undirty_space(). + */ + delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ @@ -2275,22 +3357,23 @@ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; - blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; - uint64_t txg = zio->io_txg; + blkptr_t *bp = db->db_blkptr; + objset_t *os = db->db_objset; + dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; - ASSERT3U(zio->io_error, ==, 0); + ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + /* + * For nopwrites and rewrites we ensure that the bp matches our + * original and bypass all the accounting. 
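The comment above describes how dbuf_write_done() treats rewrites and nopwrites: no new block was allocated, so the resulting block pointer must equal the original and the usual kill/born space accounting is skipped. A toy sketch of that decision; the TOY_FLAG_* values and toy_bp_t are invented for illustration and are not the real zio flags or blkptr_t:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        #define TOY_FLAG_IO_REWRITE     0x01
        #define TOY_FLAG_NOPWRITE       0x02

        typedef struct {
                uint64_t birth;
                uint64_t dva;   /* where the block lives on disk */
        } toy_bp_t;

        /*
         * When the write was a rewrite or a nopwrite, no new block was
         * born, so old and new pointers must match and the space
         * accounting is bypassed.
         */
        static void
        write_done(int flags, const toy_bp_t *bp, const toy_bp_t *bp_orig)
        {
                if (flags & (TOY_FLAG_IO_REWRITE | TOY_FLAG_NOPWRITE)) {
                        if (memcmp(bp, bp_orig, sizeof (*bp)) != 0)
                                printf("bug: pointer changed on a no-op write\n");
                        else
                                printf("no accounting needed\n");
                } else {
                        printf("kill old block %llu, record new block %llu\n",
                            (unsigned long long)bp_orig->dva,
                            (unsigned long long)bp->dva);
                }
        }

        int
        main(void)
        {
                toy_bp_t old = { 100, 7 }, new_bp = { 200, 9 };

                write_done(TOY_FLAG_NOPWRITE, &old, &old);  /* data identical */
                write_done(0, &new_bp, &old);               /* allocating write */
                return (0);
        }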
+ */ + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } @@ -2303,33 +3386,46 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; +#ifdef ZFS_DEBUG + if (db->db_blkid == DMU_SPILL_BLKID) { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); + ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && + db->db_blkptr == &dn->dn_phys->dn_spill); + DB_DNODE_EXIT(db); + } +#endif + if (db->db_level == 0) { - ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) - VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, - db) == 1); - else if (!arc_released(db->db_buf)) - arc_set_callback(db->db_buf, dbuf_do_evict, db); + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(db->db_blkid, <=, + dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - ASSERT3U(dn->dn_phys->dn_maxblkid - >> (db->db_level * epbs), >=, db->db_blkid); - arc_set_callback(db->db_buf, dbuf_do_evict, db); } + DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } @@ -2339,7 +3435,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void @@ -2381,17 +3477,25 @@ dbuf_write_override_done(zio_t *zio) dbuf_write_done(zio, NULL, db); } +/* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_t *os = dn->dn_objset; + dnode_t *dn; + objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; - zbookmark_t zb; + zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; + int wp_flag = 0; + + ASSERT(dmu_tx_is_syncing(tx)); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { @@ -2404,20 +3508,31 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { - arc_release(data, db); + dbuf_release_bp(db); } } } if (parent != dn->dn_dbuf) { + /* Our parent is an indirect block. */ + /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); + /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); + /* + * We're about to modify our parent's db_data by modifying + * our block pointer, so the parent must be released. 
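Further down in dbuf_write(), the comment explains why the parent must be released: the child is about to publish its block pointer into the parent's buffer. Conceptually an indirect block is just an array of child block pointers, and a level-0 dbuf's db_blkptr points at its slot in that array. A toy model of that relationship; the slot-index arithmetic (masking the low epbs bits of the child's block id) and the struct are simplifications for the example, not the kernel code:

        #include <stdint.h>
        #include <stdio.h>

        #define EPBS    10      /* 1024 block pointers per indirect block */

        typedef struct {
                uint64_t blk_birth;
                uint64_t dva;
        } toy_bp_t;

        /*
         * A level-0 dbuf's "db_blkptr" points at its slot inside the
         * parent indirect block.  When the child's write is ready, the
         * new pointer is copied into that slot so the (itself dirty)
         * parent picks it up when it is written.
         */
        static void
        publish_to_parent(toy_bp_t *parent_buf, uint64_t child_blkid,
            const toy_bp_t *newbp)
        {
                toy_bp_t *slot = &parent_buf[child_blkid & ((1ULL << EPBS) - 1)];

                *slot = *newbp;
        }

        int
        main(void)
        {
                static toy_bp_t parent[1 << EPBS];      /* all holes initially */
                toy_bp_t newbp = { 1234, 42 };

                publish_to_parent(parent, 1027, &newbp); /* lands in slot 3 */
                printf("slot 3 birth=%llu\n",
                    (unsigned long long)parent[3].blk_birth);
                return (0);
        }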
+ */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { - ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); + /* Our parent is the dnode itself. */ + ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid != DMU_SPILL_BLKID) || + (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } @@ -2429,32 +3544,64 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_ os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); - dmu_write_policy(os, dn, db->db_level, - db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp); + if (db->db_blkid == DMU_SPILL_BLKID) + wp_flag = WP_SPILL; + wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + + dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); + + /* + * We copy the blkptr now (rather than when we instantiate the dirty + * record), because its value can change between open context and + * syncing context. We do not need to hold dn_struct_rwlock to read + * db_blkptr because we are in syncing context. + */ + dr->dr_bp_copy = *db->db_blkptr; + + if (db->db_level == 0 && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * The BP for this block has been provided by open context + * (by dmu_sync() or dmu_buf_write_embedded()). + */ + void *contents = (data != NULL) ? data->b_data : NULL; - if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - ASSERT(db->db_state != DB_NOFILL); dr->dr_zio = zio_write(zio, os->os_spa, txg, - db->db_blkptr, data->b_data, arc_buf_size(data), &zp, - dbuf_write_override_ready, dbuf_write_override_done, dr, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + &dr->dr_bp_copy, contents, db->db.db_size, &zp, + dbuf_write_override_ready, NULL, NULL, + dbuf_write_override_done, + dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies); + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { - ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || + zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, - db->db_blkptr, NULL, db->db.db_size, &zp, - dbuf_write_nofill_ready, dbuf_write_nofill_done, db, + &dr->dr_bp_copy, NULL, db->db.db_size, &zp, + dbuf_write_nofill_ready, NULL, NULL, + dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); + + /* + * For indirect blocks, we want to setup the children + * ready callback so that we can properly handle an indirect + * block that only contains holes. 
+ */ + arc_done_func_t *children_ready_cb = NULL; + if (db->db_level != 0) + children_ready_cb = dbuf_write_children_ready; + dr->dr_zio = arc_write(zio, os->os_spa, txg, - db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, - dbuf_write_ready, dbuf_write_done, db, + &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), + &zp, dbuf_write_ready, children_ready_cb, + dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c,v retrieving revision 1.2 diff -u -p -r1.2 ddt.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c 27 Mar 2014 15:50:48 -0000 1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c 22 Nov 2015 17:22:33 -0000 @@ -20,8 +20,8 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include @@ -35,6 +35,17 @@ #include #include #include +#include + +/* + * Enable/disable prefetching of dedup-ed blocks which are going to be freed. + */ +int zfs_dedup_prefetch = 1; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP"); +SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RWTUN, &zfs_dedup_prefetch, + 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); static const ddt_ops_t *ddt_ops[DDT_TYPES] = { &ddt_zap_ops, @@ -53,10 +64,11 @@ ddt_object_create(ddt_t *ddt, enum ddt_t spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; - boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; - ddt_object_name(ddt, type, class, name, sizeof(name)); + ddt_object_name(ddt, type, class, name); ASSERT(*objectp == 0); VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); @@ -77,16 +89,18 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_ spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; + uint64_t count; char name[DDT_NAMELEN]; - ddt_object_name(ddt, type, class, name, sizeof(name)); + ddt_object_name(ddt, type, class, name); ASSERT(*objectp != 0); - ASSERT(ddt_object_count(ddt, type, class) == 0); + VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); *objectp = 0; } @@ -94,36 +108,64 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_ static int ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) { + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + uint64_t count; char name[DDT_NAMELEN]; int error; - ddt_object_name(ddt, type, class, name, sizeof(name)); + ddt_object_name(ddt, type, class, name); error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); - if (error) + if (error != 0) return 
(error); - error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class]); + &ddt->ddt_histogram[type][class])); - ASSERT(error == 0); - return (error); + /* + * Seed the cached statistics. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + + error = ddt_object_count(ddt, type, class, &count); + if (error) + return error; + + ddo->ddo_count = count; + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + + return (0); } static void ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, dmu_tx_t *tx) { + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + uint64_t count; char name[DDT_NAMELEN]; - ddt_object_name(ddt, type, class, name, sizeof(name)); + ddt_object_name(ddt, type, class, name); VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class], tx) == 0); + + /* + * Cache DDT statistics; this is the only time they'll change. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + VERIFY(ddt_object_count(ddt, type, class, &count) == 0); + + ddo->ddo_count = count; + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; } static int @@ -131,13 +173,24 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_t ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) - return (ENOENT); + return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], dde)); } -static int +static void +ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, + ddt->ddt_object[type][class], dde); +} + +int ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx) { @@ -167,13 +220,13 @@ ddt_object_walk(ddt_t *ddt, enum ddt_typ ddt->ddt_object[type][class], dde, walk)); } -uint64_t -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +int +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, - ddt->ddt_object[type][class])); + ddt->ddt_object[type][class], count)); } int @@ -181,7 +234,7 @@ ddt_object_info(ddt_t *ddt, enum ddt_typ dmu_object_info_t *doi) { if (!ddt_object_exists(ddt, type, class)) - return (ENOENT); + return (SET_ERROR(ENOENT)); return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], doi)); @@ -195,9 +248,9 @@ ddt_object_exists(ddt_t *ddt, enum ddt_t void ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, - char *name, size_t namelen) + char *name) { - (void) snprintf(name, namelen, DMU_POOL_DDT, + (void) sprintf(name, DMU_POOL_DDT, zio_checksum_table[ddt->ddt_checksum].ci_name, ddt_ops[type]->ddt_op_name, ddt_class_name[class]); } @@ -222,12 +275,13 @@ ddt_bp_create(enum zio_checksum checksum ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); bp->blk_cksum = ddk->ddk_cksum; + bp->blk_fill = 1; BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 
BP_SET_CHECKSUM(bp, checksum); - BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_TYPE(bp, DMU_OT_DEDUP); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); @@ -362,7 +416,7 @@ ddt_stat_update(ddt_t *ddt, ddt_entry_t ddt_stat_generate(ddt, dde, &dds); - bucket = highbit(dds.dds_ref_blocks) - 1; + bucket = highbit64(dds.dds_ref_blocks) - 1; ASSERT(bucket >= 0); ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; @@ -400,31 +454,28 @@ ddt_histogram_empty(const ddt_histogram_ } void -ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo) +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) { - dmu_object_info_t doi; - uint64_t count; - int error; - + /* Sum the statistics we cached in ddt_object_sync(). */ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - error = ddt_object_info(ddt, type, class, &doi); - if (error == ENOENT) - continue; - ASSERT3U(error, ==, 0); - - count = ddt_object_count(ddt, type, class); - ddo->ddo_count += count; - ddo->ddo_dspace += - (doi.doi_physical_blocks_512 << 9) / count; - ddo->ddo_mspace += doi.doi_fill_count * - doi.doi_data_block_size / count; + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; } } } + + /* ... and compute the averages. */ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } } void @@ -436,7 +487,7 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { ddt_histogram_add(ddh, - &ddt->ddt_histogram[type][class]); + &ddt->ddt_histogram_cache[type][class]); } } } @@ -543,7 +594,10 @@ ddt_compress(void *src, uchar_t *dst, si bcopy(src, dst, s_len); } - *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + *version = cpfunc; + /* CONSTCOND */ + if (ZFS_HOST_BYTEORDER) + *version |= DDT_COMPRESS_BYTEORDER_MASK; return (c_len + 1); } @@ -560,7 +614,8 @@ ddt_decompress(uchar_t *src, void *dst, else bcopy(src, dst, d_len); - if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) byteswap_uint64_array(dst, d_len); } @@ -689,6 +744,30 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *b return (dde); } +void +ddt_prefetch(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) + return; + + /* + * We only remove the DDT once all tables are empty and only + * prefetch dedup blocks when there are entries in the DDT. + * Thus no locking is required as the DDT can't disappear on us. + */ + ddt = ddt_select(spa, bp); + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &dde); + } + } +} + int ddt_entry_compare(const void *x1, const void *x2) { @@ -761,15 +840,21 @@ ddt_load(spa_t *spa) return (error == ENOENT ? 
0 : error); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; for (enum ddt_type type = 0; type < DDT_TYPES; type++) { for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - ddt_t *ddt = spa->spa_ddt[c]; error = ddt_object_load(ddt, type, class); if (error != 0 && error != ENOENT) return (error); } } + + /* + * Seed the cached histograms. + */ + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); } return (0); @@ -967,10 +1052,17 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t * ddt_object_create(ddt, ntype, nclass, tx); VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); - if (dp->dp_scrub_func != SCRUB_FUNC_NONE && - oclass > nclass && - nclass <= dp->dp_scrub_ddt_class_max) - dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde); + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) + */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } } } @@ -984,15 +1076,12 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, if (avl_numnodes(&ddt->ddt_tree) == 0) return; - ASSERT(spa_sync_pass(spa) == 1); ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); - VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, - &spa->spa_ddt_stat_object, tx) == 0); + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); } while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { @@ -1001,14 +1090,23 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, } for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + uint64_t add, count = 0; + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY(ddt_object_count(ddt, type, class, + &add) == 0); + count += add; + } + } for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { - if (!ddt_object_exists(ddt, type, class)) - continue; - ddt_object_sync(ddt, type, class, tx); - if (ddt_object_count(ddt, type, class) == 0) + if (count == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } } + + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); } void @@ -1049,6 +1147,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb ddb->ddb_type, ddb->ddb_class, &ddb->ddb_cursor, dde); } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; if (error == 0) return (0); if (error != ENOENT) @@ -1060,5 +1160,5 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb ddb->ddb_type = 0; } while (++ddb->ddb_class < DDT_CLASSES); - return (ENOENT); + return (SET_ERROR(ENOENT)); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 ddt_zap.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c 27 Feb 2010 22:30:59 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c 16 Feb 2013 16:40:56 -0000 @@ -20,8 +20,7 @@ */ /* - * 
Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -30,7 +29,6 @@ #include #include #include -#include int ddt_zap_leaf_blockshift = 12; int ddt_zap_indirect_blockshift = 12; @@ -81,6 +79,13 @@ ddt_zap_lookup(objset_t *os, uint64_t ob return (0); } +static void +ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS); +} + static int ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) { @@ -128,14 +133,11 @@ ddt_zap_walk(objset_t *os, uint64_t obje return (error); } -static uint64_t -ddt_zap_count(objset_t *os, uint64_t object) +static int +ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) { - uint64_t count = 0; - - VERIFY(zap_count(os, object, &count) == 0); - return (count); + return (zap_count(os, object, count)); } const ddt_ops_t ddt_zap_ops = { @@ -143,6 +145,7 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_create, ddt_zap_destroy, ddt_zap_lookup, + ddt_zap_prefetch, ddt_zap_update, ddt_zap_remove, ddt_zap_walk, Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c,v retrieving revision 1.5 diff -u -p -r1.5 dmu.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c 11 Apr 2015 00:23:31 -0000 1.5 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c 2 May 2017 16:59:07 -0000 @@ -19,9 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ +/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ +/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ +/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ #include #include @@ -40,60 +43,125 @@ #include #include #include +#include +#include +#include #ifdef _KERNEL -#include +#include +#include #include #endif +/* + * Enable/disable nopwrite feature. + */ +int zfs_nopwrite_enabled = 1; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, + &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); + +/* + * Tunable to control percentage of dirtied blocks from frees in one TXG. + * After this threshold is crossed, additional dirty blocks from frees + * wait until the next TXG. + * A value of zero will disable this throttle. 
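The throttle described above turns the percentage into a byte threshold against zfs_dirty_data_max; once the dirty data generated by frees across all open txgs reaches it, dmu_free_long_range_impl() waits for the next txg before freeing more. A small sketch of the computation; the 4 GiB dirty limit and the running total are made-up numbers, and a percentage over 100 falls back to a quarter of the limit, as in the patch:

        #include <stdint.h>
        #include <stdio.h>
        #include <stdbool.h>

        /* Toy version of the threshold math in dmu_free_long_range_impl(). */
        static uint64_t
        frees_threshold(uint32_t percent, uint64_t dirty_data_max)
        {
                if (percent <= 100)
                        return ((uint64_t)percent * dirty_data_max / 100);
                /* out-of-range setting: fall back to a quarter of the limit */
                return (dirty_data_max / 4);
        }

        int
        main(void)
        {
                uint64_t dirty_data_max = 4ULL << 30;           /* assume 4 GiB */
                uint64_t freed_dirty_all_txgs = 2ULL << 30;     /* dirty data from frees */
                uint32_t percent = 30;                          /* default tunable */

                uint64_t threshold = frees_threshold(percent, dirty_data_max);
                bool wait_for_next_txg =
                    (threshold != 0 && freed_dirty_all_txgs >= threshold);

                printf("threshold %llu bytes, wait=%d\n",
                    (unsigned long long)threshold, wait_for_next_txg);
                return (0);
        }

Setting the tunable to zero yields a zero threshold, and the "threshold != 0" test then disables the wait entirely, which is the documented way to switch the throttle off.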
+ */ +uint32_t zfs_per_txg_dirty_frees_percent = 30; +SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, + &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { byteswap_uint8_array, TRUE, "unallocated" }, - { zap_byteswap, TRUE, "object directory" }, - { byteswap_uint64_array, TRUE, "object array" }, - { byteswap_uint8_array, TRUE, "packed nvlist" }, - { byteswap_uint64_array, TRUE, "packed nvlist size" }, - { byteswap_uint64_array, TRUE, "bplist" }, - { byteswap_uint64_array, TRUE, "bplist header" }, - { byteswap_uint64_array, TRUE, "SPA space map header" }, - { byteswap_uint64_array, TRUE, "SPA space map" }, - { byteswap_uint64_array, TRUE, "ZIL intent log" }, - { dnode_buf_byteswap, TRUE, "DMU dnode" }, - { dmu_objset_byteswap, TRUE, "DMU objset" }, - { byteswap_uint64_array, TRUE, "DSL directory" }, - { zap_byteswap, TRUE, "DSL directory child map"}, - { zap_byteswap, TRUE, "DSL dataset snap map" }, - { zap_byteswap, TRUE, "DSL props" }, - { byteswap_uint64_array, TRUE, "DSL dataset" }, - { zfs_znode_byteswap, TRUE, "ZFS znode" }, - { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, - { byteswap_uint8_array, FALSE, "ZFS plain file" }, - { zap_byteswap, TRUE, "ZFS directory" }, - { zap_byteswap, TRUE, "ZFS master node" }, - { zap_byteswap, TRUE, "ZFS delete queue" }, - { byteswap_uint8_array, FALSE, "zvol object" }, - { zap_byteswap, TRUE, "zvol prop" }, - { byteswap_uint8_array, FALSE, "other uint8[]" }, - { byteswap_uint64_array, FALSE, "other uint64[]" }, - { zap_byteswap, TRUE, "other ZAP" }, - { zap_byteswap, TRUE, "persistent error log" }, - { byteswap_uint8_array, TRUE, "SPA history" }, - { byteswap_uint64_array, TRUE, "SPA history offsets" }, - { zap_byteswap, TRUE, "Pool properties" }, - { zap_byteswap, TRUE, "DSL permissions" }, - { zfs_acl_byteswap, TRUE, "ZFS ACL" }, - { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, - { byteswap_uint8_array, TRUE, "FUID table" }, - { byteswap_uint64_array, TRUE, "FUID table size" }, - { zap_byteswap, TRUE, "DSL dataset next clones"}, - { zap_byteswap, TRUE, "scrub work queue" }, - { zap_byteswap, TRUE, "ZFS user/group used" }, - { zap_byteswap, TRUE, "ZFS user/group quota" }, - { zap_byteswap, TRUE, "snapshot refcount tags"}, - { zap_byteswap, TRUE, "DDT ZAP algorithm" }, - { zap_byteswap, TRUE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, 
"zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, + { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, + { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } +}; + +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { + { byteswap_uint8_array, "uint8" }, + { byteswap_uint16_array, "uint16" }, + { byteswap_uint32_array, "uint32" }, + { byteswap_uint64_array, "uint64" }, + { zap_byteswap, "zap" }, + { dnode_buf_byteswap, "dnode" }, + { dmu_objset_byteswap, "objset" }, + { zfs_znode_byteswap, "znode" }, + { zfs_oldacl_byteswap, "oldacl" }, + { zfs_acl_byteswap, "acl" } }; int -dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, +dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + uint64_t blkid; + dmu_buf_impl_t *db; + + blkid = dbuf_whichblock(dn, 0, offset); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + + if (db == NULL) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (0); +} +int +dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; @@ -104,22 +172,64 @@ dmu_buf_hold(objset_t *os, uint64_t obje err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + if (db == NULL) { - err = EIO; - } else { - err = dbuf_read(db, NULL, DB_RF_CANFAIL); - if (err) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (err); +} + +int +dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); + err = dbuf_read(db, NULL, db_flags); + if (err != 0) { dbuf_rele(db, tag); - db = NULL; + *dbp = NULL; + } + } + + return (err); +} + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags 
= DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + + err = dmu_buf_hold_noread(os, object, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); + err = dbuf_read(db, NULL, db_flags); + if (err != 0) { + dbuf_rele(db, tag); + *dbp = NULL; } } - dnode_rele(dn, FTAG); - *dbp = &db->db; return (err); } @@ -130,16 +240,79 @@ dmu_bonus_max(void) } int -dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; - if (dn->dn_bonus != (dmu_buf_impl_t *)db) - return (EINVAL); - if (newsize < 0 || newsize > db->db_size) - return (EINVAL); - dnode_setbonuslen(dn, newsize, tx); - return (0); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = SET_ERROR(EINVAL); + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = SET_ERROR(EINVAL); + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +int +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (!DMU_OT_IS_VALID(type)) { + error = SET_ERROR(EINVAL); + } else if (dn->dn_bonus != db) { + error = SET_ERROR(EINVAL); + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); +} + +int +dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + error = dnode_hold(os, object, FTAG, &dn); + dbuf_rm_spill(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_rm_spill(dn, tx); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (error); } /* @@ -164,21 +337,105 @@ dmu_bonus_hold(objset_t *os, uint64_t ob dbuf_create_bonus(dn); } db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); /* as long as the bonus buf is held, the dnode will be held */ - if (refcount_add(&db->db_holds, tag) == 1) + if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); + atomic_inc_32(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. + */ + rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); + VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* + * returns ENOENT, EIO, or 0. + * + * This interface will allocate a blank spill dbuf when a spill blk + * doesn't already exist on the dnode. + * + * if you only want to find an already existing spill db, then + * dmu_spill_hold_existing() should be used. 
+ */ +int +dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = NULL; + int err; + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); + + if ((flags & DB_RF_HAVESTRUCT) == 0) + rw_exit(&dn->dn_struct_rwlock); + + ASSERT(db != NULL); + err = dbuf_read(db, NULL, flags); + if (err == 0) + *dbp = &db->db; + else + dbuf_rele(db, tag); + return (err); +} + +int +dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { + err = SET_ERROR(EINVAL); + } else { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (!dn->dn_have_spill) { + err = SET_ERROR(ENOENT); + } else { + err = dmu_spill_hold_by_dnode(dn, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); + } + + rw_exit(&dn->dn_struct_rwlock); + } + + DB_DNODE_EXIT(db); + return (err); +} + +int +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + DB_DNODE_EXIT(db); + + return (err); +} + +/* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files @@ -186,27 +443,29 @@ dmu_bonus_hold(objset_t *os, uint64_t ob */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, - int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) + boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { - dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; - hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); - dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; - if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) - dbuf_flags |= DB_RF_NOPREFETCH; + /* + * Note: We directly notify the prefetch code of this read, so that + * we can tell it about the multi-block read. dbuf_read() only knows + * about the one block it is accessing. 
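Because dbuf_read() only sees one block at a time, dmu_buf_hold_array_by_dnode() computes the span of the whole request itself before telling the prefetcher about it. The block count for a byte range comes from rounding the end up and the start down to block boundaries. A self-contained sketch of that arithmetic; P2ALIGN/P2ROUNDUP are the usual power-of-two helpers, reproduced here only so the example compiles on its own:

        #include <stdint.h>
        #include <stdio.h>

        /* Power-of-two alignment helpers (align must be a power of two). */
        #define P2ALIGN(x, align)       ((x) & -(align))
        #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

        /*
         * Number of fixed-size blocks touched by the byte range
         * [offset, offset + length), for a block size of 1 << blkshift.
         */
        static uint64_t
        range_nblks(uint64_t offset, uint64_t length, int blkshift)
        {
                uint64_t blksz = 1ULL << blkshift;

                return ((P2ROUNDUP(offset + length, blksz) -
                    P2ALIGN(offset, blksz)) >> blkshift);
        }

        int
        main(void)
        {
                /* 300 KiB starting 4 KiB into 128 KiB blocks -> 3 blocks */
                printf("%llu\n",
                    (unsigned long long)range_nblks(4096, 300 * 1024, 17));
                return (0);
        }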
+ */ + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | + DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset+length, 1ULL<> blkshift; + nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - + P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " @@ -216,39 +475,52 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); - return (EIO); + return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - if (dn->dn_objset->os_dsl_dataset) - dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; - if (dp && dsl_pool_sync_context(dp)) - start = gethrtime(); +#if defined(_KERNEL) && defined(RACCT) + if (racct_enable && !read) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_WRITEBPS, length); + racct_add_force(curproc, RACCT_WRITEIOPS, nblks); + PROC_UNLOCK(curproc); + } +#endif + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { - dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); + dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); - return (EIO); + return (SET_ERROR(EIO)); } + /* initiate async i/o */ - if (read) { + if (read) (void) dbuf_read(db, zio, dbuf_flags); - } + +#ifdef _KERNEL + else + curthread->td_ru.ru_oublock++; +#endif dbp[i] = &db->db; } + + if ((flags & DMU_READ_NO_PREFETCH) == 0 && + DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { + dmu_zfetch(&dn->dn_zfetch, blkid, nblks, + read && DNODE_IS_CACHEABLE(dn)); + } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); - /* track read overhead when we are in sync context */ - if (dp && dsl_pool_sync_context(dp)) - dp->dp_read_overhead += gethrtime() - start; if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); @@ -263,7 +535,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) - err = EIO; + err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); @@ -297,14 +569,19 @@ dmu_buf_hold_array(objset_t *os, uint64_ } int -dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, + uint64_t length, boolean_t read, void *tag, int *numbufsp, + dmu_buf_t ***dbpp) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; int err; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); return (err); } @@ -326,25 +603,32 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } +/* + * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * indirect blocks prefeteched will be those that point to the blocks containing + * the data starting at offset, and continuing to offset + len. 
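For level > 0, dmu_prefetch() counts indirect blocks rather than data blocks: the first and last byte of the range are mapped to block numbers at the requested level, and the count is last - first + 1. The sketch below assumes dbuf_whichblock() reduces to a right shift by datablkshift + level * epbs, which is a simplification of the real function (not shown in this patch); the 128 KiB block geometry is likewise just an example:

        #include <stdint.h>
        #include <stdio.h>

        #define SPA_BLKPTRSHIFT 7       /* log2 of a 128-byte blkptr_t */

        /*
         * Toy model of dbuf_whichblock(): which level-'level' block covers
         * byte 'offset' for the given data and indirect block shifts.
         */
        static uint64_t
        which_block(int level, uint64_t offset, int datablkshift, int indblkshift)
        {
                int epbs = indblkshift - SPA_BLKPTRSHIFT; /* entries per indirect */

                return (offset >> (datablkshift + level * epbs));
        }

        int
        main(void)
        {
                uint64_t offset = 10ULL << 20, len = 512ULL << 20;
                int datablkshift = 17, indblkshift = 17;  /* 128 KiB blocks */

                /* level-1 indirects to prefetch: last - first + 1 */
                uint64_t first = which_block(1, offset, datablkshift, indblkshift);
                uint64_t last = which_block(1, offset + len - 1,
                    datablkshift, indblkshift);

                printf("prefetch %llu level-1 blocks\n",
                    (unsigned long long)(last - first + 1));
                return (0);
        }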
+ * + * Note that if the indirect blocks above the blocks being prefetched are not in + * cache, they will be asychronously read in. + */ void -dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) +dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; - int nblks, i, err; - - if (zfs_prefetch_disable) - return; + int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ - dn = os->os_meta_dnode; + dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, blkid); + blkid = dbuf_whichblock(dn, level, + object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } @@ -359,18 +643,24 @@ dmu_prefetch(objset_t *os, uint64_t obje return; rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (dn->dn_datablkshift) { - int blkshift = dn->dn_datablkshift; - nblks = (P2ROUNDUP(offset+len, 1<> blkshift; + /* + * offset + len - 1 is the last byte we want to prefetch for, and offset + * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the + * last block we want to prefetch, and dbuf_whichblock(dn, level, + * offset) is the first. Then the number we need to prefetch is the + * last - first + 1. + */ + if (level > 0 || dn->dn_datablkshift != 0) { + nblks = dbuf_whichblock(dn, level, offset + len - 1) - + dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { - blkid = dbuf_whichblock(dn, offset); - for (i = 0; i < nblks; i++) - dbuf_prefetch(dn, blkid+i); + blkid = dbuf_whichblock(dn, level, offset); + for (int i = 0; i < nblks; i++) + dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); @@ -383,98 +673,136 @@ dmu_prefetch(objset_t *os, uint64_t obje * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. + * + * On input, *start should be the first offset that does not need to be + * freed (e.g. "offset + length"). On return, *start will be the first + * offset that should be freed. */ static int -get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) { - uint64_t len = *start - limit; - uint64_t blkcnt = 0; - uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); + uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); + /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); - ASSERT(limit <= *start); + ASSERT3U(minimum, <=, *start); - if (len <= iblkrange * maxblks) { - *start = limit; + if (*start - minimum <= iblkrange * maxblks) { + *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); - while (*start > limit && blkcnt < maxblks) { + for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { int err; - /* find next allocated L1 indirect */ + /* + * dnode_next_offset(BACKWARDS) will find an allocated L1 + * indirect block at or before the input offset. We must + * decrement *start so that it is at the end of the region + * to search. 
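get_next_chunk() sizes free chunks in units of level-1 indirect blocks: one L1 block covers datablksz * (1 << (indblkshift - SPA_BLKPTRSHIFT)) bytes of file data, and a chunk's start is snapped back to an L1 boundary with P2ALIGN. A sketch of that geometry with example block sizes; the real code additionally caps a chunk at maxblks L1 ranges per transaction:

        #include <stdint.h>
        #include <stdio.h>

        #define SPA_BLKPTRSHIFT 7               /* a blkptr_t is 128 bytes */
        #define P2ALIGN(x, align)       ((x) & -(align))

        int
        main(void)
        {
                uint64_t datablksz = 128 * 1024;        /* 128 KiB data blocks */
                int indblkshift = 17;                   /* 128 KiB indirect blocks */

                /* bytes of file data covered by one level-1 indirect block */
                uint64_t epb = 1ULL << (indblkshift - SPA_BLKPTRSHIFT);
                uint64_t iblkrange = datablksz * epb;   /* 128 KiB * 1024 = 128 MiB */

                /* chunk_begin snaps back to the start of the covering L1 */
                uint64_t chunk_end = 1000ULL << 20;     /* freeing up to 1000 MiB */
                uint64_t chunk_begin = P2ALIGN(chunk_end - 1, iblkrange);

                printf("each L1 covers %llu MiB; chunk [%llu MiB, %llu MiB)\n",
                    (unsigned long long)(iblkrange >> 20),
                    (unsigned long long)(chunk_begin >> 20),
                    (unsigned long long)(chunk_end >> 20));
                return (0);
        }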
+ */ + (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); - /* if there are no more, then we are done */ + /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { - *start = limit; - return (0); - } else if (err) { + *start = minimum; + break; + } else if (err != 0) { return (err); } - blkcnt += 1; - /* reset offset to end of "next" block back */ + /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); - if (*start <= limit) - *start = limit; - else - *start -= 1; } + if (*start < minimum) + *start = minimum; return (0); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, - uint64_t length, boolean_t free_dnode) + uint64_t length) { - dmu_tx_t *tx; - uint64_t object_size, start, end, len; - boolean_t trunc = (length == DMU_OBJECT_END); - int align, err; - - align = 1 << dn->dn_datablkshift; - ASSERT(align > 0); - object_size = align == 1 ? dn->dn_datablksz : - (dn->dn_maxblkid + 1) << dn->dn_datablkshift; - - end = offset + length; - if (trunc || end > object_size) - end = object_size; - if (end <= offset) + uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; + int err; + uint64_t dirty_frees_threshold; + dsl_pool_t *dp = dmu_objset_pool(os); + + if (offset >= object_size) return (0); - length = end - offset; - while (length) { - start = end; - /* assert(offset <= start) */ - err = get_next_chunk(dn, &start, offset); + if (zfs_per_txg_dirty_frees_percent <= 100) + dirty_frees_threshold = + zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; + else + dirty_frees_threshold = zfs_dirty_data_max / 4; + + if (length == DMU_OBJECT_END || offset + length > object_size) + length = object_size - offset; + + while (length != 0) { + uint64_t chunk_end, chunk_begin, chunk_len; + uint64_t long_free_dirty_all_txgs = 0; + dmu_tx_t *tx; + + chunk_end = chunk_begin = offset + length; + + /* move chunk_begin backwards to the beginning of this chunk */ + err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); - len = trunc ? DMU_OBJECT_END : end - start; + ASSERT3U(chunk_begin, >=, offset); + ASSERT3U(chunk_begin, <=, chunk_end); + + chunk_len = chunk_end - chunk_begin; + + mutex_enter(&dp->dp_lock); + for (int t = 0; t < TXG_SIZE; t++) { + long_free_dirty_all_txgs += + dp->dp_long_free_dirty_pertxg[t]; + } + mutex_exit(&dp->dp_lock); + + /* + * To avoid filling up a TXG with just frees wait for + * the next TXG to open before freeing more chunks if + * we have reached the threshold of frees + */ + if (dirty_frees_threshold != 0 && + long_free_dirty_all_txgs >= dirty_frees_threshold) { + txg_wait_open(dp, 0); + continue; + } tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, dn->dn_object, start, len); + dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); + + /* + * Mark this transaction as typically resulting in a net + * reduction in space used. + */ + dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } - dnode_free_range(dn, start, trunc ? 
-1 : len, tx); - - if (start == 0 && free_dnode) { - ASSERT(trunc); - dnode_free(dn, tx); - } - - length -= end - start; - + mutex_enter(&dp->dp_lock); + dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += + chunk_len; + mutex_exit(&dp->dp_lock); + DTRACE_PROBE3(free__long__range, + uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, + uint64_t, dmu_tx_get_txg(tx)); + dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); - end = start; + + length -= chunk_len; } return (0); } @@ -489,38 +817,43 @@ dmu_free_long_range(objset_t *os, uint64 err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); - err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + err = dmu_free_long_range_impl(os, dn, offset, length); + + /* + * It is important to zero out the maxblkid when freeing the entire + * file, so that (a) subsequent calls to dmu_free_long_range_impl() + * will take the fast path, and (b) dnode_reallocate() can verify + * that the entire file has been freed. + */ + if (err == 0 && offset == 0 && length == DMU_OBJECT_END) + dn->dn_maxblkid = 0; + dnode_rele(dn, FTAG); return (err); } int -dmu_free_object(objset_t *os, uint64_t object) +dmu_free_long_object(objset_t *os, uint64_t object) { - dnode_t *dn; dmu_tx_t *tx; int err; - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, - FTAG, &dn); + err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); - if (dn->dn_nlevels == 1) { - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - dnode_free_range(dn, 0, DMU_OBJECT_END, tx); - dnode_free(dn, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + dmu_tx_mark_netfree(tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + err = dmu_object_free(os, object, tx); + dmu_tx_commit(tx); } else { - err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + dmu_tx_abort(tx); } - dnode_rele(dn, FTAG); + return (err); } @@ -661,6 +994,25 @@ dmu_prealloc(objset_t *os, uint64_t obje dmu_buf_rele_array(dbp, numbufs, FTAG); } +void +dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx) +{ + dmu_buf_t *db; + + ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + VERIFY0(dmu_buf_hold_noread(os, object, offset, + FTAG, &db)); + + dmu_buf_write_embedded(db, + data, (bp_embedded_type_t)etype, (enum zio_compress)comp, + uncompressed_size, compressed_size, byteorder, tx); + + dmu_buf_rele(db, FTAG); +} + /* * DMU support for xuio */ @@ -681,12 +1033,10 @@ dmu_xuio_init(xuio_t *xuio, int nblk) priv->iovp = uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; -#ifdef PORT_SOLARIS if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); -#endif return (0); } @@ -701,12 +1051,10 @@ dmu_xuio_fini(xuio_t *xuio) kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); -#ifdef PORT_SOLARIS if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); -#endif } /* @@ -755,7 +1103,6 @@ dmu_xuio_clear(xuio_t *xuio, int i) priv->bufs[i] = NULL; } -#ifdef PORT_SOLARIS 
static void xuio_stat_init(void) { @@ -776,7 +1123,6 @@ xuio_stat_fini(void) xuio_ksp = NULL; } } -#endif void xuio_stat_wbuf_copied() @@ -791,8 +1137,8 @@ xuio_stat_wbuf_nocopy() } #ifdef _KERNEL -int -dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +static int +dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; @@ -802,12 +1148,12 @@ dmu_read_uio(objset_t *os, uint64_t obje * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, - &numbufs, &dbp); + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); -#ifndef __NetBSD__ /* XXX xuio */ +#ifdef UIO_XUIO if (uio->uio_extflg == UIO_XUIO) xuio = (xuio_t *)uio; #endif @@ -837,8 +1183,18 @@ dmu_read_uio(objset_t *os, uint64_t obje else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { +#ifdef illumos + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); +#endif +#ifdef __FreeBSD__ + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, + tocpy, uio); +#endif +#ifdef __NetBSD__ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); +#endif } if (err) break; @@ -850,28 +1206,77 @@ dmu_read_uio(objset_t *os, uint64_t obje return (err); } +/* + * Read 'size' bytes into the uio buffer. + * From object zdb->db_object. + * Starting at offset uio->uio_loffset. + * + * If the caller already has a dbuf in the target object + * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), + * because we don't have to find the dnode_t for the object. + */ +int +dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_read_uio_dnode(dn, uio, size); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Read 'size' bytes into the uio buffer. + * From the specified object + * Starting at offset uio->uio_loffset. + */ int -dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, - dmu_tx_t *tx) +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { - dmu_buf_t **dbp; - int numbufs, i; - int err = 0; + dnode_t *dn; + int err; if (size == 0) return (0); - err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, - FALSE, FTAG, &numbufs, &dbp); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); - for (i = 0; i < numbufs; i++) { - int tocpy; - int bufoff; - dmu_buf_t *db = dbp[i]; + err = dmu_read_uio_dnode(dn, uio, size); - ASSERT(size > 0); + dnode_rele(dn, FTAG); + + return (err); +} + +static int +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + int err = 0; + int i; + + err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); @@ -883,6 +1288,7 @@ dmu_write_uio(objset_t *os, uint64_t obj else dmu_buf_will_dirty(db, tx); +#ifdef illumos /* * XXX uiomove could block forever (eg. nfs-backed * pages). 
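dmu_write_uio_dnode() above walks the held dbufs, copying from where the uio currently points: bufoff is the offset within the dbuf, tocpy is how much of it this pass covers, and a dbuf that will be completely overwritten is marked "fill" (no read needed) while a partial update is marked "dirty". A toy user-space version of that arithmetic, with memcpy standing in for uiomove and invented buffer sizes:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        #define MIN(a, b)       ((a) < (b) ? (a) : (b))

        /*
         * Copy 'size' bytes from 'src' into a file made of fixed-size
         * buffers, starting at byte 'loffset'.  Mirrors the bufoff/tocpy
         * arithmetic in dmu_write_uio_dnode(); the buffers are invented.
         */
        static void
        chunked_write(char *filebuf, uint64_t db_size, const char *src,
            uint64_t loffset, uint64_t size)
        {
                while (size > 0) {
                        uint64_t db_offset = (loffset / db_size) * db_size;
                        uint64_t bufoff = loffset - db_offset;
                        uint64_t tocpy = MIN(db_size - bufoff, size);

                        /*
                         * tocpy == db_size: the buffer is completely
                         * overwritten ("fill"); otherwise it must be read
                         * first and dirtied ("dirty").
                         */
                        printf("buf@%llu: copy %llu bytes at offset %llu (%s)\n",
                            (unsigned long long)db_offset,
                            (unsigned long long)tocpy,
                            (unsigned long long)bufoff,
                            tocpy == db_size ? "fill" : "dirty");

                        memcpy(filebuf + loffset, src, tocpy);
                        src += tocpy;
                        loffset += tocpy;
                        size -= tocpy;
                }
        }

        int
        main(void)
        {
                static char file[64 * 1024];
                static const char data[20 * 1024];

                chunked_write(file, 16 * 1024, data, 12 * 1024, sizeof (data));
                return (0);
        }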
There needs to be a uiolockdown() function @@ -891,6 +1297,15 @@ dmu_write_uio(objset_t *os, uint64_t obj */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); +#endif +#ifdef __FreeBSD__ + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, + uio); +#endif +#ifdef __NetBSD__ + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); +#endif if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -900,11 +1315,66 @@ dmu_write_uio(objset_t *os, uint64_t obj size -= tocpy; } + dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } -#ifndef __NetBSD__ +/* + * Write 'size' bytes from the uio buffer. + * To object zdb->db_object. + * Starting at offset uio->uio_loffset. + * + * If the caller already has a dbuf in the target object + * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), + * because we don't have to find the dnode_t for the object. + */ +int +dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Write 'size' bytes from the uio buffer. + * To the specified object. + * Starting at offset uio->uio_loffset. + */ +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_write_uio_dnode(dn, uio, size, tx); + + dnode_rele(dn, FTAG); + + return (err); +} + +#ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) @@ -959,8 +1429,123 @@ dmu_write_pages(objset_t *os, uint64_t o dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } -#endif /* __NetBSD__ */ +#endif /* illumos */ + +#ifdef __FreeBSD__ +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + vm_page_t *ma, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + struct sf_buf *sf; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(*ma, &sf); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(sf); + ma += 1; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + struct vm_page **pgs, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + 
FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U((*pgs)->offset, ==, db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(*pgs, S_READ); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(*pgs, va); + pgs++; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} #endif +#endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. @@ -968,9 +1553,9 @@ dmu_write_pages(objset_t *os, uint64_t o arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - return (arc_loan_buf(dn->dn_objset->os_spa, size)); + return (arc_loan_buf(db->db_objset->os_spa, size)); } /* @@ -980,7 +1565,7 @@ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); - VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); + arc_buf_destroy(buf, FTAG); } /* @@ -992,23 +1577,53 @@ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { - dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_size(buf); uint64_t blkid; + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, offset); + blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); + DB_DNODE_EXIT(dbuf); - if (offset == db->db.db_offset && blksz == db->db.db_size) { + /* + * We can only assign if the offset is aligned, the arc buf is the + * same size as the dbuf, and the dbuf is not metadata. It + * can't be metadata because the loaned arc buf comes from the + * user-data kmem arena. 
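
For reference, a minimal sketch (not part of this patch) of the loaned-ARC-buffer write path that the alignment comment above constrains. The function name zero_copy_write, the bonus_db handle, and the elided dmu_tx_t setup are illustrative assumptions; the real callers live in the ZPL/zvol code.

/*
 * Hypothetical caller, assuming the usual ZFS kernel headers. 'bonus_db'
 * is any dbuf already held in the target object (e.g. its bonus buffer),
 * 'len' is expected to match the object's block size, and the tx
 * setup/commit is elided.
 */
static void
zero_copy_write(dmu_buf_t *bonus_db, uint64_t offset, int len,
    const void *src, dmu_tx_t *tx)
{
	arc_buf_t *abuf = dmu_request_arcbuf(bonus_db, len);

	/* Fill the loaned buffer directly; no dbuf copy has happened yet. */
	bcopy(src, abuf->b_data, len);

	/*
	 * dmu_assign_arcbuf() consumes the loan either way: it attaches the
	 * buffer to the dbuf when offset/size/type allow it, and otherwise
	 * falls back to dmu_write() and returns the loan itself.
	 */
	dmu_assign_arcbuf(bonus_db, offset, abuf, tx);
}
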
+ */ + if (offset == db->db.db_offset && blksz == db->db.db_size && + DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) { +#ifdef _KERNEL + curthread->td_ru.ru_oublock++; +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_force(curproc, RACCT_WRITEBPS, blksz); + racct_add_force(curproc, RACCT_WRITEIOPS, 1); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ +#endif /* _KERNEL */ dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { + objset_t *os; + uint64_t object; + + DB_DNODE_ENTER(dbuf); + dn = DB_DNODE(dbuf); + os = dn->dn_objset; + object = dn->dn_object; + DB_DNODE_EXIT(dbuf); + dbuf_rele(db, FTAG); - dmu_write(dn->dn_objset, dn->dn_object, offset, blksz, - buf->b_data, tx); + dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } @@ -1027,7 +1642,6 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *bu { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { @@ -1037,8 +1651,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *bu * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); - } else { - ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } @@ -1062,10 +1675,33 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { + dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); + if (dr->dt.dl.dr_nopwrite) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint8_t chksum = BP_GET_CHECKSUM(bp_orig); + + ASSERT(BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); + ASSERT(zio_checksum_table[chksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); + } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; - if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) + + /* + * Old style holes are filled with all zeros, whereas + * new-style holes maintain their lsize, type, level, + * and birth time (see zio_write_compress). While we + * need to reset the BP_SET_LSIZE() call that happened + * in dmu_sync_ready for old style holes, we do *not* + * want to wipe out the information contained in new + * style holes. Thus, only zero out the block pointer if + * it's an old style hole. + */ + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && + dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1083,11 +1719,22 @@ dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; + blkptr_t *bp_orig = &zio->io_bp_orig; if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { - ASSERT(zio->io_bp->blk_birth == zio->io_txg); - ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); - zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + /* + * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE) + * then there is nothing to do here. Otherwise, free the + * newly allocated block in this txg. 
+ */ + if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } } dmu_tx_commit(dsa->dsa_tx); @@ -1099,7 +1746,7 @@ dmu_sync_late_arrival_done(zio_t *zio) static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, - zio_prop_t *zp, zbookmark_t *zb) + zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; @@ -1108,7 +1755,8 @@ dmu_sync_late_arrival(zio_t *pio, objset dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); - return (EIO); /* Make zl_get_data do txg_waited_synced() */ + /* Make zl_get_data do txg_waited_synced() */ + return (SET_ERROR(EIO)); } dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); @@ -1117,10 +1765,11 @@ dmu_sync_late_arrival(zio_t *pio, objset dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; - zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, - zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), + zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, + zp, dmu_sync_late_arrival_ready, NULL, + NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL, zb)); return (0); } @@ -1132,7 +1781,7 @@ dmu_sync_late_arrival(zio_t *pio, objset * * Return values: * - * EEXIST: this txg has already been synced, so there's nothing to to. + * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. @@ -1159,17 +1808,20 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; - zbookmark_t zb; + zbookmark_phys_t zb; zio_prop_t zp; + dnode_t *dn; ASSERT(pio != NULL); - ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); - dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. @@ -1190,7 +1842,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); - return (EEXIST); + return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { @@ -1212,9 +1864,39 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s * There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); - return (ENOENT); + return (SET_ERROR(ENOENT)); } + ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); + + /* + * Assume the on-disk data is X, the current syncing data (in + * txg - 1) is Y, and the current in-memory data is Z (currently + * in dmu_sync). + * + * We usually want to perform a nopwrite if X and Z are the + * same. However, if Y is different (i.e. the BP is going to + * change before this write takes effect), then a nopwrite will + * be incorrect - we would override with X, which could have + * been freed when Y was written. 
+ * + * (Note that this is not a concern when we are nop-writing from + * syncing context, because X and Y must be identical, because + * all previous txgs have been synced.) + * + * Therefore, we disable nopwrite if the current BP could change + * before this TXG. There are two ways it could change: by + * being dirty (dr_next is non-NULL), or by being freed + * (dnode_block_freed()). This behavior is verified by + * zio_done(), which VERIFYs that the override BP is identical + * to the on-disk BP. + */ + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + zp.zp_nopwrite = B_FALSE; + DB_DNODE_EXIT(db); + ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { @@ -1224,7 +1906,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s * have been dirtied since, or we would have cleared the state. */ mutex_exit(&db->db_mtx); - return (EALREADY); + return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); @@ -1238,8 +1920,8 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, - bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, - dmu_sync_ready, dmu_sync_done, dsa, + bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); @@ -1247,7 +1929,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; int err; @@ -1262,13 +1944,19 @@ dmu_object_set_blocksize(objset_t *os, u void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os, object, FTAG, &dn); - ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); + /* + * Send streams include each object's checksum function. This + * check ensures that the receiving system can understand the + * checksum function transmitted. + */ + ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); + ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); @@ -1276,36 +1964,66 @@ dmu_object_set_checksum(objset_t *os, ui void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, - dmu_tx_t *tx) + dmu_tx_t *tx) { dnode_t *dn; - /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os, object, FTAG, &dn); - ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); + /* + * Send streams include each object's compression function. This + * check ensures that the receiving system can understand the + * compression function transmitted. + */ + ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } int zfs_mdcomp_disable = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, + &zfs_mdcomp_disable, 0, "Disable metadata compression"); + +/* + * When the "redundant_metadata" property is set to "most", only indirect + * blocks of this level and higher will have an additional ditto block. 
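
As a usage illustration for the dmu_sync() return values documented above (EEXIST, ENOENT, EALREADY, EIO, 0), here is a hedged sketch of a caller shaped like a ZIL get-data path. log_block_sync and the log_write_done callback are hypothetical names, not code from this patch.

/*
 * Illustrative fragment only; assumes the usual ZFS kernel headers and a
 * caller-supplied dmu_sync_cb_t.
 */
static int
log_block_sync(zio_t *pio, uint64_t txg, zgd_t *zgd,
    dmu_sync_cb_t *log_write_done)
{
	int error = dmu_sync(pio, txg, log_write_done, zgd);

	switch (error) {
	case 0:
		/* Write issued; log_write_done() fires when the zio completes. */
		break;
	case EEXIST:	/* txg already synced: do not log this write. */
	case ENOENT:	/* block was freed: nothing to log. */
		break;
	case EALREADY:	/* already DR_OVERRIDDEN: log the existing bp as-is. */
		break;
	default:	/* e.g. EIO: caller falls back to txg_wait_synced(). */
		break;
	}
	return (error);
}
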
+ */ +int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; - boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata); + boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || + (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; - boolean_t dedup; + boolean_t dedup = B_FALSE; + boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; /* - * Determine checksum setting. + * We maintain different write policies for each of the following + * types of data: + * 1. metadata + * 2. preallocated blocks (i.e. level-0 blocks of a dump device) + * 3. all other level 0 blocks */ if (ismd) { + if (zfs_mdcomp_disable) { + compress = ZIO_COMPRESS_EMPTY; + } else { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zio_compress_select(os->os_spa, + ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); + } + /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a @@ -1313,82 +2031,93 @@ dmu_write_policy(objset_t *os, dnode_t * * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ - if (zio_checksum_table[checksum].ci_correctable < 1 || - zio_checksum_table[checksum].ci_eck) + if (!(zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_METADATA) || + (zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - checksum = zio_checksum_select(dn->dn_checksum, checksum); - } - /* - * Determine compression setting. - */ - if (ismd) { + if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || + (os->os_redundant_metadata == + ZFS_REDUNDANT_METADATA_MOST && + (level >= zfs_redundant_metadata_most_ditto_level || + DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) + copies++; + } else if (wp & WP_NOFILL) { + ASSERT(level == 0); + /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. + * If we're writing preallocated blocks, we aren't actually + * writing them so don't set any policy properties. These + * blocks are currently only used by an external subsystem + * outside of zfs (i.e. dump) and not written by the zio + * pipeline. */ - compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; + compress = ZIO_COMPRESS_OFF; + checksum = ZIO_CHECKSUM_NOPARITY; } else { - compress = zio_compress_select(dn->dn_compress, compress); - } + compress = zio_compress_select(os->os_spa, dn->dn_compress, + compress); - /* - * Determine dedup setting. If we are in dmu_sync(), we won't - * actually dedup now because that's all done in syncing context; - * but we do want to use the dedup checkum. If the checksum is not - * strong enough to ensure unique signatures, force dedup_verify. - */ - dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); - if (dedup) { - checksum = dedup_checksum; - if (!zio_checksum_table[checksum].ci_dedup) - dedup_verify = 1; - } + checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? + zio_checksum_select(dn->dn_checksum, checksum) : + dedup_checksum; - if (wp & WP_DMU_SYNC) - dedup = 0; - - if (wp & WP_NOFILL) { - ASSERT(!ismd && level == 0); - checksum = ZIO_CHECKSUM_OFF; - compress = ZIO_COMPRESS_OFF; - dedup = B_FALSE; + /* + * Determine dedup setting. 
If we are in dmu_sync(), + * we won't actually dedup now because that's all + * done in syncing context; but we do want to use the + * dedup checkum. If the checksum is not strong + * enough to ensure unique signatures, force + * dedup_verify. + */ + if (dedup_checksum != ZIO_CHECKSUM_OFF) { + dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; + if (!(zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP)) + dedup_verify = B_TRUE; + } + + /* + * Enable nopwrite if we have secure enough checksum + * algorithm (see comment in zio_nop_write) and + * compression is enabled. We don't enable nopwrite if + * dedup is enabled as the two features are mutually + * exclusive. + */ + nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE) && + compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; zp->zp_compress = compress; - zp->zp_type = type; + zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; - zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); + zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; + zp->zp_nopwrite = nopwrite; } int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; - int i, err; + int err; - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); /* * Sync any current changes before * we go trundling through the block pointers. */ - for (i = 0; i < TXG_SIZE; i++) { - if (list_link_active(&dn->dn_dirty_link[i])) - break; + err = dmu_object_wait_synced(os, object); + if (err) { + return (err); } - if (i != TXG_SIZE) { - dnode_rele(dn, FTAG); - txg_wait_synced(dmu_objset_pool(os), 0); - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) { + return (err); } err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); @@ -1397,6 +2126,37 @@ dmu_offset_next(objset_t *os, uint64_t o return (err); } +/* + * Given the ZFS object, if it contains any dirty nodes + * this function flushes all dirty blocks to disk. This + * ensures the DMU object info is updated. A more efficient + * future version might just find the TXG with the maximum + * ID and wait for that to be synced. 
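
A small, hedged sketch of how a SEEK_HOLE/SEEK_DATA style lookup could sit on top of the reworked dmu_offset_next() above; the wrapper name is invented for illustration, and the real in-tree caller (the ZPL holey-file code) handles EOF and error mapping in more detail.

/* Illustrative wrapper only; assumes the usual ZFS kernel headers. */
static int
seek_data_or_hole(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	int error;

	/*
	 * dmu_offset_next() first forces any dirty blocks of the object out
	 * to disk (via dmu_object_wait_synced()) so the on-disk block
	 * pointers it walks are current.
	 */
	error = dmu_offset_next(os, object, hole, off);
	if (error == ESRCH) {
		/*
		 * Nothing of the requested kind past *off; a real lseek()
		 * handler would map this to "implicit hole at EOF" or to
		 * ENXIO, depending on what was asked for.
		 */
	}
	return (error);
}
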
+ */ +int +dmu_object_wait_synced(objset_t *os, uint64_t object) +{ + dnode_t *dn; + int error, i; + + error = dnode_hold(os, object, FTAG, &dn); + if (error) { + return (error); + } + + for (i = 0; i < TXG_SIZE; i++) { + if (list_link_active(&dn->dn_dirty_link[i])) { + break; + } + } + dnode_rele(dn, FTAG); + if (i != TXG_SIZE) { + txg_wait_synced(dmu_objset_pool(os), 0); + } + + return (0); +} + void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { @@ -1416,11 +2176,12 @@ dmu_object_info_from_dnode(dnode_t *dn, doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; + doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; - doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) - doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; + doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); @@ -1450,9 +2211,13 @@ dmu_object_info(objset_t *os, uint64_t o * As above, but faster; can be used when you have a held dbuf in hand. */ void -dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) +dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { - dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + DB_DNODE_ENTER(db); + dmu_object_info_from_dnode(DB_DNODE(db), doi); + DB_DNODE_EXIT(db); } /* @@ -1460,14 +2225,20 @@ dmu_object_info_from_db(dmu_buf_t *db, d * This is specifically optimized for zfs_getattr(). */ void -dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) +dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, + u_longlong_t *nblk512) { - dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; + DB_DNODE_EXIT(db); } void @@ -1518,25 +2289,29 @@ byteswap_uint8_array(void *vbuf, size_t void dmu_init(void) { - dbuf_init(); + zfs_dbgmsg_init(); + sa_cache_init(); + xuio_stat_init(); + dmu_objset_init(); dnode_init(); zfetch_init(); - arc_init(); + zio_compress_init(); l2arc_init(); -#ifdef PORT_SOLARIS - xuio_stat_init(); -#endif + arc_init(); + dbuf_init(); } void dmu_fini(void) { - arc_fini(); + arc_fini(); /* arc depends on l2arc, so arc must go first */ + l2arc_fini(); zfetch_fini(); - dnode_fini(); + zio_compress_fini(); dbuf_fini(); - l2arc_fini(); -#ifdef PORT_SOLARIS + dnode_fini(); + dmu_objset_fini(); xuio_stat_fini(); -#endif + sa_cache_fini(); + zfs_dbgmsg_fini(); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_diff.c 22 Apr 2017 07:54:20 -0000 @@ -0,0 +1,268 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct diffarg { +#ifdef __FreeBSD__ + kthread_t *da_td; + struct file *da_fp; /* file to which we are reporting */ +#else + struct vnode *da_vp; /* file to which we are reporting */ +#endif + offset_t *da_offp; + int da_err; /* error that stopped diff search */ + dmu_diff_record_t da_ddr; +}; + +#ifdef __FreeBSD__ +static int +write_bytes(struct diffarg *da) +{ + struct uio auio; + struct iovec aiov; + + aiov.iov_base = (caddr_t)&da->da_ddr; + aiov.iov_len = sizeof (da->da_ddr); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = aiov.iov_len; + auio.uio_rw = UIO_WRITE; + auio.uio_offset = (off_t)-1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = da->da_td; +#ifdef _KERNEL + if (da->da_fp->f_type == DTYPE_VNODE) + bwillwrite(); + return (fo_write(da->da_fp, &auio, da->da_td->td_ucred, 0, da->da_td)); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + return (EOPNOTSUPP); +#endif +} +#endif /* __FreeBSD__ */ + +static int +write_record(struct diffarg *da) +{ + ssize_t resid; /* have to get resid to get detailed errno */ + + if (da->da_ddr.ddr_type == DDR_NONE) { + da->da_err = 0; + return (0); + } + +#ifdef __FreeBSD__ + da->da_err = write_bytes(da); +#else + da->da_err = vn_rdwr(UIO_WRITE, da->da_vp, (caddr_t)&da->da_ddr, + sizeof (da->da_ddr), 0, UIO_SYSSPACE, FAPPEND, + RLIM64_INFINITY, CRED(), &resid); +#endif + *da->da_offp += sizeof (da->da_ddr); + return (da->da_err); +} + +static int +report_free_dnode_range(struct diffarg *da, uint64_t first, uint64_t last) +{ + ASSERT(first <= last); + if (da->da_ddr.ddr_type != DDR_FREE || + first != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_FREE; + da->da_ddr.ddr_first = first; + da->da_ddr.ddr_last = last; + return (0); + } + da->da_ddr.ddr_last = last; + return (0); +} + +static int +report_dnode(struct diffarg *da, uint64_t object, dnode_phys_t *dnp) +{ + ASSERT(dnp != NULL); + if (dnp->dn_type == DMU_OT_NONE) + return (report_free_dnode_range(da, object, object)); + + if (da->da_ddr.ddr_type != DDR_INUSE || + object != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_INUSE; + da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; + return (0); + } + da->da_ddr.ddr_last = object; + return (0); +} + +#define DBP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +/* ARGSUSED */ +static int +diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, 
void *arg) +{ + struct diffarg *da = arg; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (SET_ERROR(EINTR)); + + if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT) + return (0); + + if (BP_IS_HOLE(bp)) { + uint64_t span = DBP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + + err = report_free_dnode_range(da, dnobj, + dnobj + (span >> DNODE_SHIFT) - 1); + if (err) + return (err); + } else if (zb->zb_level == 0) { + dnode_phys_t *blk; + arc_buf_t *abuf; + arc_flags_t aflags = ARC_FLAG_WAIT; + int blksz = BP_GET_LSIZE(bp); + int i; + + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) + return (SET_ERROR(EIO)); + + blk = abuf->b_data; + for (i = 0; i < blksz >> DNODE_SHIFT; i++) { + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = report_dnode(da, dnobj, blk+i); + if (err) + break; + } + arc_buf_destroy(abuf, &abuf); + if (err) + return (err); + /* Don't care about the data blocks */ + return (TRAVERSE_VISIT_NO_CHILDREN); + } + return (0); +} + +int +dmu_diff(const char *tosnap_name, const char *fromsnap_name, +#ifdef __FreeBSD__ + struct file *fp, offset_t *offp) +#else + struct vnode *vp, offset_t *offp) +#endif +{ + struct diffarg da; + dsl_dataset_t *fromsnap; + dsl_dataset_t *tosnap; + dsl_pool_t *dp; + int error; + uint64_t fromtxg; + + if (strchr(tosnap_name, '@') == NULL || + strchr(fromsnap_name, '@') == NULL) + return (SET_ERROR(EINVAL)); + + error = dsl_pool_hold(tosnap_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(EXDEV)); + } + + fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg; + dsl_dataset_rele(fromsnap, FTAG); + + dsl_dataset_long_hold(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + +#ifdef __FreeBSD__ + da.da_td = curthread; + da.da_fp = fp; +#else + da.da_vp = vp; +#endif + da.da_offp = offp; + da.da_ddr.ddr_type = DDR_NONE; + da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; + da.da_err = 0; + + error = traverse_dataset(tosnap, fromtxg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); + + if (error != 0) { + da.da_err = error; + } else { + /* we set the da.da_err we return as side-effect */ + (void) write_record(&da); + } + + dsl_dataset_long_rele(tosnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + + return (da.da_err); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dmu_object.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c 27 Feb 2010 22:30:45 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c 27 Mar 2016 02:52:21 -0000 @@ -19,14 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
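
To make the DBP_SPAN() macro used by diff_cb() above concrete, here is worked arithmetic with typical, assumed meta-dnode geometry; none of these constants are asserted by the patch itself.

/*
 * Assumed values: SPA_MINBLOCKSHIFT = 9, SPA_BLKPTRSHIFT = 7,
 * DNODE_SHIFT = 9 (512-byte dnodes), 16 KB meta-dnode data blocks
 * (dn_datablkszsec = 32) and 128 KB indirect blocks (dn_indblkshift = 17).
 *
 *   level 0: DBP_SPAN = 32 << 9            = 16 KB -> 16 KB >> 9 =    32 dnodes
 *   level 1: DBP_SPAN = 32 << (9 + 1 * 10) = 16 MB -> 16 MB >> 9 = 32768 dnodes
 *
 * So when diff_cb() sees a hole blkptr one level up in the meta-dnode, a
 * single report_free_dnode_range() call can cover 32768 consecutive
 * object numbers.
 */
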
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. */ #include #include #include #include +#include +#include uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, @@ -34,7 +37,7 @@ dmu_object_alloc(objset_t *os, dmu_objec { uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << - (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int restarted = B_FALSE; @@ -47,10 +50,16 @@ dmu_object_alloc(objset_t *os, dmu_objec * reasonably sparse (at most 1/4 full). Look from the * beginning once, but after that keep looking from here. * If we can't find one, just keep going from here. + * + * Note that dmu_traverse depends on the behavior that we use + * multiple blocks of the dnode object before going back to + * reuse objects. Any change to this algorithm should preserve + * that property or find another solution to the issues + * described in traverse_visitbp. */ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; - int error = dnode_next_offset(os->os_meta_dnode, + int error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; @@ -91,7 +100,7 @@ dmu_object_claim(objset_t *os, uint64_t int err; if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) - return (EBADF); + return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (err) @@ -105,55 +114,22 @@ dmu_object_claim(objset_t *os, uint64_t int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen) + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { dnode_t *dn; - dmu_tx_t *tx; - int nblkptr; int err; if (object == DMU_META_DNODE_OBJECT) - return (EBADF); + return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); - if (dn->dn_type == ot && dn->dn_datablksz == blocksize && - dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) { - /* nothing is changing, this is a noop */ - dnode_rele(dn, FTAG); - return (0); - } - - nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); - - /* - * If we are losing blkptrs or changing the block size this must - * be a new file instance. We must clear out the previous file - * contents before we can change this type of metadata in the dnode. - */ - if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) { - err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); - if (err) - goto out; - } - - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, object); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - goto out; - } - dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); - dmu_tx_commit(tx); -out: dnode_rele(dn, FTAG); - return (err); } @@ -178,16 +154,72 @@ dmu_object_free(objset_t *os, uint64_t o return (0); } +/* + * Return (in *objectp) the next object which is allocated (or a hole) + * after *object, taking into account only objects that may have been modified + * after the specified txg. + */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { uint64_t offset = (*objectp + 1) << DNODE_SHIFT; int error; - error = dnode_next_offset(os->os_meta_dnode, + error = dnode_next_offset(DMU_META_DNODE(os), (hole ? 
DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; return (error); } + +/* + * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the + * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. + * + * Only for use from syncing context, on MOS objects. + */ +void +dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, + dmu_tx_t *tx) +{ + dnode_t *dn; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dnode_hold(mos, object, FTAG, &dn)); + if (dn->dn_type == DMU_OTN_ZAP_METADATA) { + dnode_rele(dn, FTAG); + return; + } + ASSERT3U(dn->dn_type, ==, old_type); + ASSERT0(dn->dn_maxblkid); + dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = + DMU_OTN_ZAP_METADATA; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); + + mzap_create_impl(mos, object, 0, 0, tx); + + spa_feature_incr(dmu_objset_spa(mos), + SPA_FEATURE_EXTENSIBLE_DATASET, tx); +} + +void +dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + dmu_object_type_t t; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dnode_hold(mos, object, FTAG, &dn)); + t = dn->dn_type; + dnode_rele(dn, FTAG); + + if (t == DMU_OTN_ZAP_METADATA) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_EXTENSIBLE_DATASET, tx); + } + VERIFY0(dmu_object_free(mos, object, tx)); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dmu_objset.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c 27 Feb 2010 22:30:49 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c 3 Dec 2016 17:03:49 -0000 @@ -19,10 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ +/* Portions Copyright 2010 Robert Milkowski */ + #include #include #include @@ -40,7 +48,38 @@ #include #include #include -#include +#include +#include +#include +#include + +/* + * Needed to close a window in dnode_move() that allows the objset to be freed + * before it can be safely accessed. + */ +krwlock_t os_lock; + +/* + * Tunable to overwrite the maximum number of threads for the parallization + * of dmu_objset_find_dp, needed to speed up the import of pools with many + * datasets. + * Default is 4 times the number of leaf vdevs. + */ +int dmu_find_threads = 0; + +static void dmu_objset_find_dp_cb(void *arg); + +void +dmu_objset_init(void) +{ + rw_init(&os_lock, NULL, RW_DEFAULT, NULL); +} + +void +dmu_objset_fini(void) +{ + rw_destroy(&os_lock); +} spa_t * dmu_objset_spa(objset_t *os) @@ -91,7 +130,13 @@ dmu_objset_id(objset_t *os) return (ds ? 
ds->ds_object : 0); } -uint64_t +zfs_sync_type_t +dmu_objset_syncprop(objset_t *os) +{ + return (os->os_sync); +} + +zfs_logbias_op_t dmu_objset_logbias(objset_t *os) { return (os->os_logbias); @@ -120,7 +165,8 @@ compression_changed_cb(void *arg, uint64 */ ASSERT(newval != ZIO_COMPRESS_INHERIT); - os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); + os->os_compress = zio_compress_select(os->os_spa, newval, + ZIO_COMPRESS_ON); } static void @@ -184,6 +230,36 @@ secondary_cache_changed_cb(void *arg, ui } static void +sync_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || + newval == ZFS_SYNC_DISABLED); + + os->os_sync = newval; + if (os->os_zil) + zil_set_sync(os->os_zil, newval); +} + +static void +redundant_metadata_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || + newval == ZFS_REDUNDANT_METADATA_MOST); + + os->os_redundant_metadata = newval; +} + +static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; @@ -195,6 +271,14 @@ logbias_changed_cb(void *arg, uint64_t n zil_set_logbias(os->os_zil, newval); } +static void +recordsize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + os->os_recordsize = newval; +} + void dmu_objset_byteswap(void *buf, size_t size) { @@ -225,42 +309,36 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { - uint32_t aflags = ARC_WAIT; - zbookmark_t zb; + arc_flags_t aflags = ARC_FLAG_WAIT; + zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; dprintf_bp(os->os_rootbp, "reading %s", ""); - /* - * NB: when bprewrite scrub can change the bp, - * and this is called from dmu_objset_open_ds_os, the bp - * could change, and we'll need a lock. - */ - err = arc_read_nolock(NULL, spa, os->os_rootbp, + err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); - if (err) { + if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) - err = EIO; + err = SET_ERROR(EIO); return (err); } /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { - arc_buf_t *buf = arc_buf_alloc(spa, + arc_buf_t *buf = arc_alloc_buf(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); - (void) arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } @@ -269,7 +347,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; - os->os_phys_buf = arc_buf_alloc(spa, size, + os->os_phys_buf = arc_alloc_buf(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); @@ -281,48 +359,91 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. */ - if (ds) { - err = dsl_prop_register(ds, "primarycache", + if (ds != NULL) { + boolean_t needlock = B_FALSE; + + /* + * Note: it's valid to open the objset if the dataset is + * long-held, in which case the pool_config lock will not + * be held. + */ + if (!dsl_pool_config_held(dmu_objset_pool(os))) { + needlock = B_TRUE; + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + } + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "secondarycache", + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); - if (!dsl_dataset_is_snapshot(ds)) { - if (err == 0) - err = dsl_prop_register(ds, "checksum", + } + if (!ds->ds_is_snapshot) { + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "compression", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "copies", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "dedup", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); - if (err == 0) - err = dsl_prop_register(ds, "logbias", + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SYNC), + sync_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_REDUNDANT_METADATA), + redundant_metadata_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + recordsize_changed_cb, os); + } } - if (err) { - VERIFY(arc_buf_remove_ref(os->os_phys_buf, - &os->os_phys_buf) == 1); + if (needlock) + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (err != 0) { + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } - } else if (ds == NULL) { + } else { /* It's the meta-objset. 
*/ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - os->os_compress = ZIO_COMPRESS_LZJB; + os->os_compress = ZIO_COMPRESS_ON; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; - os->os_dedup_verify = 0; - os->os_logbias = 0; + os->os_dedup_verify = B_FALSE; + os->os_logbias = ZFS_LOGBIAS_LATENCY; + os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } - os->os_zil_header = os->os_phys->os_zil_header; + if (ds == NULL || !ds->ds_is_snapshot) + os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { @@ -340,24 +461,13 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - os->os_meta_dnode = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - os->os_userused_dnode = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); - os->os_groupused_dnode = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); - } - - /* - * We should be the only thread trying to do this because we - * have ds_opening_lock - */ - if (ds) { - mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_objset == NULL); - ds->ds_objset = os; - mutex_exit(&ds->ds_lock); + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; @@ -369,63 +479,160 @@ dmu_objset_from_ds(dsl_dataset_t *ds, ob { int err = 0; + /* + * We shouldn't be doing anything with dsl_dataset_t's unless the + * pool_config lock is held, or the dataset is long-held. + */ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) || + dsl_dataset_long_held(ds)); + mutex_enter(&ds->ds_opening_lock); - *osp = ds->ds_objset; - if (*osp == NULL) { + if (ds->ds_objset == NULL) { + objset_t *os; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, osp); + ds, dsl_dataset_get_blkptr(ds), &os); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + if (err == 0) { + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); + } } + *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } -/* called from zpl */ +/* + * Holds the pool while the objset is held. Therefore only one objset + * can be held at a time. 
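
A minimal sketch of the dmu_objset_hold()/dmu_objset_rele() pairing that the comment above describes (the dsl_pool is held for as long as the objset is); the wrapper name is invented for illustration and is not part of this patch.

/* Illustrative only; assumes the usual ZFS kernel headers. */
static int
with_objset_held(const char *name)
{
	objset_t *os;
	int error;

	error = dmu_objset_hold(name, FTAG, &os);	/* holds pool + dataset */
	if (error != 0)
		return (error);

	/* ... short, read-only use of 'os' while the holds are in place ... */

	dmu_objset_rele(os, FTAG);			/* releases dataset, then pool */
	return (0);
}
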
+ */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { + dsl_pool_t *dp; dsl_dataset_t *ds; int err; - err = dsl_dataset_hold(name, tag, &ds); - if (err) + err = dsl_pool_hold(name, tag, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, name, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, tag); return (err); + } err = dmu_objset_from_ds(ds, osp); - if (err) + if (err != 0) { dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); + } return (err); } -/* called from zpl */ -int -dmu_objset_own(const char *name, dmu_objset_type_t type, +static int +dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { - dsl_dataset_t *ds; int err; - err = dsl_dataset_own(name, B_FALSE, tag, &ds); - if (err) - return (err); - err = dmu_objset_from_ds(ds, osp); - if (err) { + if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { - dmu_objset_disown(*osp, tag); - return (EINVAL); + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EINVAL)); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { - dmu_objset_disown(*osp, tag); - return (EROFS); + dsl_dataset_disown(ds, tag); + return (SET_ERROR(EROFS)); } return (err); } +/* + * dsl_pool must not be held when this is called. + * Upon successful return, there will be a longhold on the dataset, + * and the dsl_pool will not be held. + */ +int +dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_own(dp, name, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + err = dmu_objset_own_impl(ds, type, readonly, tag, osp); + dsl_pool_rele(dp, FTAG); + + return (err); +} + +int +dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_own_obj(dp, obj, tag, &ds); + if (err != 0) + return (err); + + return (dmu_objset_own_impl(ds, type, readonly, tag, osp)); +} + void dmu_objset_rele(objset_t *os, void *tag) { + dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); + dsl_pool_rele(dp, tag); +} + +/* + * When we are called, os MUST refer to an objset associated with a dataset + * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner + * == tag. We will then release and reacquire ownership of the dataset while + * holding the pool config_rwlock to avoid intervening namespace or ownership + * changes may occur. + * + * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to + * release the hold on its dataset and acquire a new one on the dataset of the + * same name so that it can be partially torn down and reconstructed. 
+ */ +void +dmu_objset_refresh_ownership(objset_t *os, void *tag) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds, *newds; + char name[ZFS_MAX_DATASET_NAME_LEN]; + + ds = os->os_dsl_dataset; + VERIFY3P(ds, !=, NULL); + VERIFY3P(ds->ds_owner, ==, tag); + VERIFY(dsl_dataset_long_held(ds)); + + dsl_dataset_name(ds, name); + dp = dmu_objset_pool(os); + dsl_pool_config_enter(dp, FTAG); + dmu_objset_disown(os, tag); + VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); + VERIFY3P(newds, ==, os->os_dsl_dataset); + dsl_pool_config_exit(dp, FTAG); } void @@ -434,45 +641,56 @@ dmu_objset_disown(objset_t *os, void *ta dsl_dataset_disown(os->os_dsl_dataset, tag); } -int +void dmu_objset_evict_dbufs(objset_t *os) { + dnode_t dn_marker; dnode_t *dn; mutex_enter(&os->os_lock); - - /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, os->os_meta_dnode); - list_insert_tail(&os->os_dnodes, os->os_meta_dnode); - - /* - * Find the first dnode with holds. We have to do this dance - * because dnode_add_ref() only works if you already have a - * hold. If there are no holds then it has no dbufs so OK to - * skip. - */ - for (dn = list_head(&os->os_dnodes); - dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&os->os_dnodes, dn)) - continue; - - while (dn) { - dnode_t *next_dn = dn; - - do { - next_dn = list_next(&os->os_dnodes, next_dn); - } while (next_dn && !dnode_add_ref(next_dn, FTAG)); - - mutex_exit(&os->os_lock); - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - mutex_enter(&os->os_lock); - dn = next_dn; + dn = list_head(&os->os_dnodes); + while (dn != NULL) { + /* + * Skip dnodes without holds. We have to do this dance + * because dnode_add_ref() only works if there is already a + * hold. If the dnode has no holds, then it has no dbufs. + */ + if (dnode_add_ref(dn, FTAG)) { + list_insert_after(&os->os_dnodes, dn, &dn_marker); + mutex_exit(&os->os_lock); + + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + + mutex_enter(&os->os_lock); + dn = list_next(&os->os_dnodes, &dn_marker); + list_remove(&os->os_dnodes, &dn_marker); + } else { + dn = list_next(&os->os_dnodes, dn); + } } mutex_exit(&os->os_lock); - return (list_head(&os->os_dnodes) != os->os_meta_dnode); + + if (DMU_USERUSED_DNODE(os) != NULL) { + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); + } + dnode_evict_dbufs(DMU_META_DNODE(os)); } +/* + * Objset eviction processing is split into into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty. 
+ */ void dmu_objset_evict(objset_t *os) { @@ -481,44 +699,51 @@ dmu_objset_evict(objset_t *os) for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); - if (ds) { - if (!dsl_dataset_is_snapshot(ds)) { - VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "dedup", - dedup_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "logbias", - logbias_changed_cb, os)); - } - VERIFY(0 == dsl_prop_unregister(ds, "primarycache", - primary_cache_changed_cb, os)); - VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", - secondary_cache_changed_cb, os)); + if (ds) + dsl_prop_unregister_all(ds, os); + + if (os->os_sa) + sa_tear_down(os); + + dmu_objset_evict_dbufs(os); + + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); } +} - /* - * We should need only a single pass over the dnode list, since - * nothing can be added to the list at this point. - */ - (void) dmu_objset_evict_dbufs(os); - - dnode_special_close(os->os_meta_dnode); - if (os->os_userused_dnode) { - dnode_special_close(os->os_userused_dnode); - dnode_special_close(os->os_groupused_dnode); +void +dmu_objset_evict_done(objset_t *os) +{ + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + + dnode_special_close(&os->os_meta_dnode); + if (DMU_USERUSED_DNODE(os)) { + dnode_special_close(&os->os_userused_dnode); + dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); + + /* + * This is a barrier to prevent the objset from going away in + * dnode_move() until we can safely ensure that the objset is still in + * use. We consider the objset valid before the barrier and invalid + * after the barrier. + */ + rw_enter(&os_lock, RW_READER); + rw_exit(&os_lock); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } @@ -537,12 +762,13 @@ dmu_objset_create_impl(spa_t *spa, dsl_d dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); - if (ds) - mutex_enter(&ds->ds_opening_lock); - VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os)); - if (ds) - mutex_exit(&ds->ds_opening_lock); - mdn = os->os_meta_dnode; + + if (ds != NULL) + VERIFY0(dmu_objset_from_ds(ds, &os)); + else + VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); + + mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); @@ -563,11 +789,17 @@ dmu_objset_create_impl(spa_t *spa, dsl_d /* * Determine the number of levels necessary for the meta-dnode - * to contain DN_MAX_OBJECT dnodes. + * to contain DN_MAX_OBJECT dnodes. Note that in order to + * ensure that we do not overflow 64 bits, there has to be + * a nlevels that gives us a number of blocks > DN_MAX_OBJECT + * but < 2^64. Therefore, + * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be + * less than (64 - log2(DN_MAX_OBJECT)) (16). 
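
To illustrate the bound discussed above, here is a small stand-alone calculation of the meta-dnode level count using assumed typical values (16 KB dnode blocks, 512-byte dnodes, 128 KB indirect blocks, 128-byte block pointers, 3 block pointers in the dnode, DN_MAX_OBJECT = 2^48). It mirrors the while loop in the hunk that follows but is not code from the patch; in the kernel the shifts come from the meta-dnode itself.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed geometry; hedged, not taken from this patch. */
	int datablkshift = 14;		/* 16 KB meta-dnode data blocks */
	int dnode_shift = 9;		/* 512-byte dnodes */
	int indblkshift = 17;		/* 128 KB indirect blocks */
	int blkptrshift = 7;		/* 128-byte block pointers */
	uint64_t nblkptr = 3;
	uint64_t dn_max_object = 1ULL << 48;
	int levels = 1;

	while ((nblkptr << (datablkshift - dnode_shift +
	    (levels - 1) * (indblkshift - blkptrshift))) < dn_max_object)
		levels++;

	/* With these assumed values this prints 6. */
	printf("meta-dnode levels = %d\n", levels);
	return (0);
}
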
*/ - while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + + while ((uint64_t)mdn->dn_nblkptr << + (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < - DN_MAX_OBJECT * sizeof (dnode_phys_t)) + DN_MAX_OBJECT) levels++; mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = @@ -588,299 +820,202 @@ dmu_objset_create_impl(spa_t *spa, dsl_d return (os); } -struct oscarg { - void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - void *userarg; - dsl_dataset_t *clone_origin; - const char *lastname; - dmu_objset_type_t type; - uint64_t flags; -}; +typedef struct dmu_objset_create_arg { + const char *doca_name; + cred_t *doca_cred; + void (*doca_userfunc)(objset_t *os, void *arg, + cred_t *cr, dmu_tx_t *tx); + void *doca_userarg; + dmu_objset_type_t doca_type; + uint64_t doca_flags; +} dmu_objset_create_arg_t; /*ARGSUSED*/ static int -dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_objset_create_check(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct oscarg *oa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - int err; - uint64_t ddobj; + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + int error; + + if (strchr(doca->doca_name, '@') != NULL) + return (SET_ERROR(EINVAL)); - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - oa->lastname, sizeof (uint64_t), 1, &ddobj); - if (err != ENOENT) - return (err ? err : EEXIST); - - if (oa->clone_origin != NULL) { - /* You can't clone across pools. */ - if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); - - /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(oa->clone_origin)) - return (EINVAL); + if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EEXIST)); } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); + dsl_dir_rele(pdd, FTAG); - return (0); + return (error); } static void -dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct oscarg *oa = arg2; - uint64_t dsobj; - - ASSERT(dmu_tx_is_syncing(tx)); + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *ds; + uint64_t obj; + blkptr_t *bp; + objset_t *os; - dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_origin, oa->flags, cr, tx); + VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); - if (oa->clone_origin == NULL) { - dsl_dataset_t *ds; - blkptr_t *bp; - objset_t *os; + obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, + doca->doca_cred, tx); - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, - FTAG, &ds)); - bp = dsl_dataset_get_blkptr(ds); - ASSERT(BP_IS_HOLE(bp)); + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bp = dsl_dataset_get_blkptr(ds); + os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, + ds, bp, doca->doca_type, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); - os = dmu_objset_create_impl(dsl_dataset_get_spa(ds), - ds, bp, oa->type, tx); - - if (oa->userfunc) - oa->userfunc(os, oa->userarg, cr, tx); - dsl_dataset_rele(ds, FTAG); + if 
(doca->doca_userfunc != NULL) { + doca->doca_userfunc(os, doca->doca_userarg, + doca->doca_cred, tx); } - spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, - tx, cr, "dataset = %llu", dsobj); + spa_history_log_internal_ds(ds, "create", tx, ""); + dsl_dataset_rele(ds, FTAG); + dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { - dsl_dir_t *pdd; - const char *tail; - int err = 0; - struct oscarg oa = { 0 }; + dmu_objset_create_arg_t doca; - ASSERT(strchr(name, '@') == NULL); - err = dsl_dir_open(name, FTAG, &pdd, &tail); - if (err) - return (err); - if (tail == NULL) { - dsl_dir_close(pdd, FTAG); - return (EEXIST); - } - - oa.userfunc = func; - oa.userarg = arg; - oa.lastname = tail; - oa.type = type; - oa.flags = flags; - - err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, - dmu_objset_create_sync, pdd, &oa, 5); - dsl_dir_close(pdd, FTAG); - return (err); -} + doca.doca_name = name; + doca.doca_cred = CRED(); + doca.doca_flags = flags; + doca.doca_userfunc = func; + doca.doca_userarg = arg; + doca.doca_type = type; + + return (dsl_sync_task(name, + dmu_objset_create_check, dmu_objset_create_sync, &doca, + 5, ZFS_SPACE_CHECK_NORMAL)); +} + +typedef struct dmu_objset_clone_arg { + const char *doca_clone; + const char *doca_origin; + cred_t *doca_cred; +} dmu_objset_clone_arg_t; -int -dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) +/*ARGSUSED*/ +static int +dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { + dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; - int err = 0; - struct oscarg oa = { 0 }; - - ASSERT(strchr(name, '@') == NULL); - err = dsl_dir_open(name, FTAG, &pdd, &tail); - if (err) - return (err); - if (tail == NULL) { - dsl_dir_close(pdd, FTAG); - return (EEXIST); - } - - oa.lastname = tail; - oa.clone_origin = clone_origin; - oa.flags = flags; + int error; + dsl_dataset_t *origin; + dsl_pool_t *dp = dmu_tx_pool(tx); - err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, - dmu_objset_create_sync, pdd, &oa, 5); - dsl_dir_close(pdd, FTAG); - return (err); -} + if (strchr(doca->doca_clone, '@') != NULL) + return (SET_ERROR(EINVAL)); -int -dmu_objset_destroy(const char *name, boolean_t defer) -{ - dsl_dataset_t *ds; - int error; + if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); - /* - * dsl_dataset_destroy() can free any claimed-but-unplayed - * intent log, but if there is an active log, it has blocks that - * are allocated, but may not yet be reflected in the on-disk - * structure. Only the ZIL knows how to free them, so we have - * to call into it here. - */ - error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); - if (error == 0) { - objset_t *os; - if (dmu_objset_from_ds(ds, &os) == 0) - zil_destroy(dmu_objset_zil(os), B_FALSE); - error = dsl_dataset_destroy(ds, FTAG, defer); - /* dsl_dataset_destroy() closes the ds. 
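The rewritten dmu_objset_create() above only packages its arguments and hands the check/sync pair to dsl_sync_task(); the check callback runs in both open and syncing context, and the sync callback runs once the check has passed. A hedged caller sketch (the dataset name and callback are hypothetical, not taken from this patch):

static void
example_newfs_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	/*
	 * Runs in syncing context against the freshly created objset;
	 * a real filesystem would create its master node and root
	 * directory here.
	 */
}

static int
example_create(void)
{
	return (dmu_objset_create("tank/newfs", DMU_OST_ZFS, 0,
	    example_newfs_cb, NULL));
}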
*/ + error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EEXIST)); } - return (error); -} + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EDQUOT)); + } + dsl_dir_rele(pdd, FTAG); -struct snaparg { - dsl_sync_task_group_t *dstg; - char *snapname; - char failed[MAXPATHLEN]; - boolean_t recursive; - nvlist_t *props; -}; + error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); + if (error != 0) + return (error); -static int -snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - objset_t *os = arg1; - struct snaparg *sn = arg2; - - /* The props have already been checked by zfs_check_userprops(). */ + /* You can only clone snapshots, not the head datasets. */ + if (!origin->ds_is_snapshot) { + dsl_dataset_rele(origin, FTAG); + return (SET_ERROR(EINVAL)); + } + dsl_dataset_rele(origin, FTAG); - return (dsl_dataset_snapshot_check(os->os_dsl_dataset, - sn->snapname, tx)); + return (0); } static void -snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { - objset_t *os = arg1; - dsl_dataset_t *ds = os->os_dsl_dataset; - struct snaparg *sn = arg2; + dmu_objset_clone_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *origin, *ds; + uint64_t obj; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); + VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); - if (sn->props) { - dsl_props_arg_t pa; - pa.pa_props = sn->props; - pa.pa_source = ZPROP_SRC_LOCAL; - dsl_props_set_sync(ds->ds_prev, &pa, cr, tx); - } + obj = dsl_dataset_create_sync(pdd, tail, origin, 0, + doca->doca_cred, tx); + + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + dsl_dataset_name(origin, namebuf); + spa_history_log_internal_ds(ds, "clone", tx, + "origin=%s (%llu)", namebuf, origin->ds_object); + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); } -static int -dmu_objset_snapshot_one(const char *name, void *arg) +int +dmu_objset_clone(const char *clone, const char *origin) { - struct snaparg *sn = arg; - objset_t *os; - int err; - char *cp; + dmu_objset_clone_arg_t doca; - /* - * If the objset starts with a '%', then ignore it unless it was - * explicitly named (ie, not recursive). These hidden datasets - * are always inconsistent, and by not opening them here, we can - * avoid a race with dsl_dir_destroy_check(). - */ - cp = strrchr(name, '/'); - if (cp && cp[1] == '%' && sn->recursive) - return (0); - - (void) strcpy(sn->failed, name); - - /* - * Check permissions if we are doing a recursive snapshot. The - * permission checks for the starting dataset have already been - * performed in zfs_secpolicy_snapshot() - */ - if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) - return (err); - - err = dmu_objset_hold(name, sn, &os); - if (err != 0) - return (err); - - /* - * If the objset is in an inconsistent state (eg, in the process - * of being destroyed), don't snapshot it. As with %hidden - * datasets, we return EBUSY if this name was explicitly - * requested (ie, not recursive), and otherwise ignore it. 
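The public clone entry point now takes just the two dataset names; the check callback above rejects a clone name containing '@', a missing parent directory, and an origin that is not a snapshot. A usage sketch with hypothetical names (dmu_objset_snapshot_one(), added later in this hunk, supplies the origin snapshot):

static int
example_snapshot_then_clone(void)
{
	int err;

	/* Builds the "tank/fs@stable" nvlist and calls dsl_dataset_snapshot(). */
	err = dmu_objset_snapshot_one("tank/fs", "stable");
	if (err != 0)
		return (err);

	/* The origin must be a snapshot; the clone name must not exist yet. */
	return (dmu_objset_clone("tank/fs/devel", "tank/fs@stable"));
}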
- */ - if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { - dmu_objset_rele(os, sn); - return (sn->recursive ? 0 : EBUSY); - } + doca.doca_clone = clone; + doca.doca_origin = origin; + doca.doca_cred = CRED(); - /* - * NB: we need to wait for all in-flight changes to get to disk, - * so that we snapshot those changes. zil_suspend does this as - * a side effect. - */ - err = zil_suspend(dmu_objset_zil(os)); - if (err == 0) { - dsl_sync_task_create(sn->dstg, snapshot_check, - snapshot_sync, os, sn, 3); - } else { - dmu_objset_rele(os, sn); - } - - return (err); + return (dsl_sync_task(clone, + dmu_objset_clone_check, dmu_objset_clone_sync, &doca, + 5, ZFS_SPACE_CHECK_NORMAL)); } int -dmu_objset_snapshot(char *fsname, char *snapname, - nvlist_t *props, boolean_t recursive) +dmu_objset_snapshot_one(const char *fsname, const char *snapname) { - dsl_sync_task_t *dst; - struct snaparg sn; - spa_t *spa; int err; + char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); + nvlist_t *snaps = fnvlist_alloc(); - (void) strcpy(sn.failed, fsname); - - err = spa_open(fsname, &spa, FTAG); - if (err) - return (err); - - sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - sn.snapname = snapname; - sn.props = props; - sn.recursive = recursive; - - if (recursive) { - err = dmu_objset_find(fsname, - dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); - } else { - err = dmu_objset_snapshot_one(fsname, &sn); - } - - if (err == 0) - err = dsl_sync_task_group_wait(sn.dstg); - - for (dst = list_head(&sn.dstg->dstg_tasks); dst; - dst = list_next(&sn.dstg->dstg_tasks, dst)) { - objset_t *os = dst->dst_arg1; - dsl_dataset_t *ds = os->os_dsl_dataset; - if (dst->dst_err) - dsl_dataset_name(ds, sn.failed); - zil_resume(dmu_objset_zil(os)); - dmu_objset_rele(os, &sn); - } - - if (err) - (void) strcpy(fsname, sn.failed); - dsl_sync_task_group_destroy(sn.dstg); - spa_close(spa, FTAG); + fnvlist_add_boolean(snaps, longsnap); + strfree(longsnap); + err = dsl_dataset_snapshot(snaps, NULL, NULL); + fnvlist_free(snaps); return (err); } @@ -919,9 +1054,9 @@ dmu_objset_write_ready(zio_t *zio, arc_b objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; - ASSERT(bp == os->os_rootbp); - ASSERT(BP_GET_TYPE(bp) == DMU_OT_OBJSET); - ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); + ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects @@ -931,7 +1066,12 @@ dmu_objset_write_ready(zio_t *zio, arc_b */ bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) - bp->blk_fill += dnp->dn_blkptr[i].blk_fill; + bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); + if (os->os_dsl_dataset != NULL) + rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); + *os->os_rootbp = *bp; + if (os->os_dsl_dataset != NULL) + rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); } /* ARGSUSED */ @@ -951,6 +1091,7 @@ dmu_objset_write_done(zio_t *zio, arc_bu (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } + kmem_free(bp, sizeof (*bp)); } /* called from dsl */ @@ -958,12 +1099,14 @@ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; - zbookmark_t zb; + zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; list_t *list; list_t *newlist = NULL; dbuf_dirty_record_t *dr; + blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); + *blkptr_copy = *os->os_rootbp; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -983,33 +1126,32 @@ 
dmu_objset_sync(objset_t *os, zio_t *pio /* * Create the root block IO */ - arc_release(os->os_phys_buf, &os->os_phys_buf); - SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, - os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, - dmu_objset_write_ready, dmu_objset_write_done, os, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, + os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ - os->os_meta_dnode->dn_zio = zio; - dnode_sync(os->os_meta_dnode, tx); + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; - if (os->os_userused_dnode && - os->os_userused_dnode->dn_type != DMU_OT_NONE) { - os->os_userused_dnode->dn_zio = zio; - dnode_sync(os->os_userused_dnode, tx); - os->os_groupused_dnode->dn_zio = zio; - dnode_sync(os->os_groupused_dnode, tx); + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; @@ -1027,9 +1169,9 @@ dmu_objset_sync(objset_t *os, zio_t *pio dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); - list = &os->os_meta_dnode->dn_dirty_records[txgoff]; + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { - ASSERT(dr->dr_dbuf->db_level == 0); + ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); @@ -1061,80 +1203,296 @@ boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] && - os->os_userused_dnode); + used_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); +} + +typedef struct userquota_node { + uint64_t uqn_id; + int64_t uqn_delta; + avl_node_t uqn_node; +} userquota_node_t; + +typedef struct userquota_cache { + avl_tree_t uqc_user_deltas; + avl_tree_t uqc_group_deltas; +} userquota_cache_t; + +static int +userquota_compare(const void *l, const void *r) +{ + const userquota_node_t *luqn = l; + const userquota_node_t *ruqn = r; + + if (luqn->uqn_id < ruqn->uqn_id) + return (-1); + if (luqn->uqn_id > ruqn->uqn_id) + return (1); + return (0); } static void -do_userquota_callback(objset_t *os, dnode_phys_t *dnp, - boolean_t subtract, dmu_tx_t *tx) +do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) { - static const char zerobuf[DN_MAX_BONUSLEN] = {0}; - uint64_t user, group; + void *cookie; + userquota_node_t *uqn; - ASSERT(dnp->dn_type != 0 || - (bcmp(DN_BONUS(dnp), zerobuf, DN_MAX_BONUSLEN) == 0 && - DN_USED_BYTES(dnp) == 0)); - - if ((dnp->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) && - 0 == used_cbs[os->os_phys->os_type](dnp->dn_bonustype, - DN_BONUS(dnp), &user, &group)) { - int64_t delta = DNODE_SIZE + DN_USED_BYTES(dnp); + ASSERT(dmu_tx_is_syncing(tx)); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, + &cookie)) != NULL) { + 
VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_user_deltas); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, + &cookie)) != NULL) { + VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_group_deltas); +} + +static void +userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta) +{ + userquota_node_t search = { .uqn_id = id }; + avl_index_t idx; + + userquota_node_t *uqn = avl_find(avl, &search, &idx); + if (uqn == NULL) { + uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); + uqn->uqn_id = id; + avl_insert(avl, uqn, idx); + } + uqn->uqn_delta += delta; +} + +static void +do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags, + uint64_t user, uint64_t group, boolean_t subtract) +{ + if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { + int64_t delta = DNODE_SIZE + used; if (subtract) delta = -delta; - VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, - user, delta, tx)); - VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, - group, delta, tx)); + + userquota_update_cache(&cache->uqc_user_deltas, user, delta); + userquota_update_cache(&cache->uqc_group_deltas, group, delta); } } void -dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx) +dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; + userquota_cache_t cache = { 0 }; ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + avl_create(&cache.uqc_user_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + avl_create(&cache.uqc_group_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + while (dn = list_head(list)) { + int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); - ASSERT(dn->dn_oldphys); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. */ - if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { - VERIFY(0 == zap_create_claim(os, + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { + VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); - VERIFY(0 == zap_create_claim(os, + VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } - /* - * We intentionally modify the zap object even if the - * net delta (due to phys-oldphys) is zero. Otherwise - * the block of the zap obj could be shared between - * datasets but need to be different between them after - * a bprewrite. - */ - do_userquota_callback(os, dn->dn_oldphys, B_TRUE, tx); - do_userquota_callback(os, dn->dn_phys, B_FALSE, tx); + flags = dn->dn_id_flags; + ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { + do_userquota_update(&cache, + dn->dn_oldused, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, B_TRUE); + } + if (flags & DN_ID_NEW_EXIST) { + do_userquota_update(&cache, + DN_USED_BYTES(dn->dn_phys), + dn->dn_phys->dn_flags, dn->dn_newuid, + dn->dn_newgid, B_FALSE); + } - /* - * The mutex is needed here for interlock with dnode_allocate. 
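The userquota code now accumulates one signed delta per uid/gid in AVL trees and issues a single ZAP update per id when the sync finishes, instead of two zap_increment_int() calls per dnode. A user-space analogue of that coalescing, with a tiny fixed table standing in for the AVL trees (names and sizes illustrative):

#include <stdint.h>
#include <stdio.h>

#define EX_SLOTS	64

struct ex_delta {
	uint64_t id;
	int64_t	 delta;
	int	 used;
};

static struct ex_delta ex_cache[EX_SLOTS];	/* cf. uqc_user_deltas */

static void
ex_update(uint64_t id, int64_t delta)		/* cf. userquota_update_cache() */
{
	for (int i = 0; i < EX_SLOTS; i++) {
		if (ex_cache[i].used && ex_cache[i].id == id) {
			ex_cache[i].delta += delta;
			return;
		}
		if (!ex_cache[i].used) {
			ex_cache[i].used = 1;
			ex_cache[i].id = id;
			ex_cache[i].delta = delta;
			return;
		}
	}
}

static void
ex_flush(void)					/* cf. do_userquota_cacheflush() */
{
	for (int i = 0; i < EX_SLOTS; i++)
		if (ex_cache[i].used)
			printf("id %ju: delta %jd\n",
			    (uintmax_t)ex_cache[i].id,
			    (intmax_t)ex_cache[i].delta);
}

int
main(void)
{
	ex_update(1000, 4096);		/* one object of uid 1000 grew */
	ex_update(1000, -512);		/* another object of uid 1000 shrank */
	ex_update(0, 8192);
	ex_flush();			/* two updates issued, not three */
	return (0);
}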
- */ mutex_enter(&dn->dn_mtx); - zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t)); - dn->dn_oldphys = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + dn->dn_olduid = dn->dn_newuid; + dn->dn_oldgid = dn->dn_newgid; + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (dn->dn_bonuslen == 0) + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + else + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); list_remove(list, dn); dnode_rele(dn, list); } + do_userquota_cacheflush(os, &cache, tx); +} + +/* + * Returns a pointer to data to find uid/gid from + * + * If a dirty record for transaction group that is syncing can't + * be found then NULL is returned. In the NULL case it is assumed + * the uid/gid aren't changing. + */ +static void * +dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr, **drp; + void *data; + + if (db->db_dirtycnt == 0) + return (db->db.db_data); /* Nothing is changing */ + + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) + if (dr->dr_txg == tx->tx_txg) + break; + + if (dr == NULL) { + data = NULL; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + + return (data); +} + +void +dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + void *data = NULL; + dmu_buf_impl_t *db = NULL; + uint64_t *user = NULL; + uint64_t *group = NULL; + int flags = dn->dn_id_flags; + int error; + boolean_t have_spill = B_FALSE; + + if (!dmu_objset_userused_enabled(dn->dn_objset)) + return; + + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| + DN_ID_CHKED_SPILL))) + return; + + if (before && dn->dn_bonuslen != 0) + data = DN_BONUS(dn->dn_phys); + else if (!before && dn->dn_bonuslen != 0) { + if (dn->dn_bonus) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + data = dmu_objset_userquota_find_data(db, tx); + } else { + data = DN_BONUS(dn->dn_phys); + } + } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { + int rf = 0; + + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + error = dmu_spill_hold_by_dnode(dn, + rf | DB_RF_MUST_SUCCEED, + FTAG, (dmu_buf_t **)&db); + ASSERT(error == 0); + mutex_enter(&db->db_mtx); + data = (before) ? db->db.db_data : + dmu_objset_userquota_find_data(db, tx); + have_spill = B_TRUE; + } else { + mutex_enter(&dn->dn_mtx); + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + mutex_exit(&dn->dn_mtx); + return; + } + + if (before) { + ASSERT(data); + user = &dn->dn_olduid; + group = &dn->dn_oldgid; + } else if (data) { + user = &dn->dn_newuid; + group = &dn->dn_newgid; + } + + /* + * Must always call the callback in case the object + * type has changed and that type isn't an object type to track + */ + error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, + user, group); + + /* + * Preserve existing uid/gid when the callback can't determine + * what the new uid/gid are and the callback returned EEXIST. + * The EEXIST error tells us to just use the existing uid/gid. + * If we don't know what the old values are then just assign + * them to 0, since that is a new file being created. 
+ */ + if (!before && data == NULL && error == EEXIST) { + if (flags & DN_ID_OLD_EXIST) { + dn->dn_newuid = dn->dn_olduid; + dn->dn_newgid = dn->dn_oldgid; + } else { + dn->dn_newuid = 0; + dn->dn_newgid = 0; + } + error = 0; + } + + if (db) + mutex_exit(&db->db_mtx); + + mutex_enter(&dn->dn_mtx); + if (error == 0 && before) + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (error == 0 && !before) + dn->dn_id_flags |= DN_ID_NEW_EXIST; + + if (have_spill) { + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + } else { + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + mutex_exit(&dn->dn_mtx); + if (have_spill) + dmu_buf_rele((dmu_buf_t *)db, FTAG); } boolean_t @@ -1153,9 +1511,9 @@ dmu_objset_userspace_upgrade(objset_t *o if (dmu_objset_userspace_present(os)) return (0); if (!dmu_objset_userused_enabled(os)) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); if (dmu_objset_is_snapshot(os)) - return (EINVAL); + return (SET_ERROR(EINVAL)); /* * We simply need to mark every object dirty, so that it will be @@ -1171,15 +1529,15 @@ dmu_objset_userspace_upgrade(objset_t *o int objerr; if (issig(JUSTLOOKING) && issig(FORREAL)) - return (EINTR); + return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); - if (objerr) + if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); - if (objerr) { + if (objerr != 0) { dmu_tx_abort(tx); continue; } @@ -1234,7 +1592,7 @@ int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); + return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } @@ -1246,12 +1604,12 @@ dmu_snapshot_realname(objset_t *os, char dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; - if (ds->ds_phys->ds_snapnames_zapobj == 0) - return (ENOENT); + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) + return (SET_ERROR(ENOENT)); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, - real, maxlen, conflict)); + dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, + MT_FIRST, real, maxlen, conflict)); } int @@ -1262,21 +1620,23 @@ dmu_snapshot_list_next(objset_t *os, int zap_cursor_t cursor; zap_attribute_t attr; - if (ds->ds_phys->ds_snapnames_zapobj == 0) - return (ENOENT); + ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); + + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) + return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, *offp); + dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); - return (ENOENT); + return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); - return (ENAMETOOLONG); + return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); @@ -1301,21 +1661,21 @@ dmu_dir_list_next(objset_t *os, int name /* there is no next dir on a snapshot! 
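dmu_snapshot_list_next() keeps its serialized ZAP-cursor interface but now asserts that the caller holds the pool configuration lock. A hedged iteration sketch; the full argument list is recalled from the ZFS headers rather than shown in this hunk, so treat it as illustrative:

static void
example_walk_snapshots(objset_t *os)
{
	char name[ZFS_MAX_DATASET_NAME_LEN];
	uint64_t id, cookie = 0;
	boolean_t conflict;

	/* Caller holds dsl_pool_config; ENOENT means the cursor is done. */
	while (dmu_snapshot_list_next(os, sizeof (name), name,
	    &id, &cookie, &conflict) == 0) {
		/* consume 'name' and 'id' */
	}
}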
*/ if (os->os_dsl_dataset->ds_object != - dd->dd_phys->dd_head_dataset_obj) - return (ENOENT); + dsl_dir_phys(dd)->dd_head_dataset_obj) + return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj, *offp); + dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); - return (ENOENT); + return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); - return (ENAMETOOLONG); + return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); @@ -1328,42 +1688,242 @@ dmu_dir_list_next(objset_t *os, int name return (0); } -struct findarg { - int (*func)(const char *, void *); - void *arg; -}; +typedef struct dmu_objset_find_ctx { + taskq_t *dc_tq; + dsl_pool_t *dc_dp; + uint64_t dc_ddobj; + int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); + void *dc_arg; + int dc_flags; + kmutex_t *dc_error_lock; + int *dc_error; +} dmu_objset_find_ctx_t; -/* ARGSUSED */ -static int -findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +static void +dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) { - struct findarg *fa = arg; - return (fa->func(dsname, fa->arg)); + dsl_pool_t *dp = dcp->dc_dp; + dmu_objset_find_ctx_t *child_dcp; + dsl_dir_t *dd; + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + uint64_t thisobj; + int err = 0; + + /* don't process if there already was an error */ + if (*dcp->dc_error != 0) + goto out; + + err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd); + if (err != 0) + goto out; + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_rele(dd, FTAG); + goto out; + } + + thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* + * Iterate over all children. + */ + if (dcp->dc_flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP); + *child_dcp = *dcp; + child_dcp->dc_ddobj = attr->za_first_integer; + if (dcp->dc_tq != NULL) + (void) taskq_dispatch(dcp->dc_tq, + dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); + else + dmu_objset_find_dp_impl(child_dcp); + } + zap_cursor_fini(&zc); + } + + /* + * Iterate over all snapshots. + */ + if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { + dsl_dataset_t *ds; + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + + if (err == 0) { + uint64_t snapobj; + + snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + err = dsl_dataset_hold_obj(dp, + attr->za_first_integer, FTAG, &ds); + if (err != 0) + break; + err = dcp->dc_func(dp, ds, dcp->dc_arg); + dsl_dataset_rele(ds, FTAG); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + } + } + + dsl_dir_rele(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + + if (err != 0) + goto out; + + /* + * Apply to self. 
+ */ + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + if (err != 0) + goto out; + err = dcp->dc_func(dp, ds, dcp->dc_arg); + dsl_dataset_rele(ds, FTAG); + +out: + if (err != 0) { + mutex_enter(dcp->dc_error_lock); + /* only keep first error */ + if (*dcp->dc_error == 0) + *dcp->dc_error = err; + mutex_exit(dcp->dc_error_lock); + } + + kmem_free(dcp, sizeof (*dcp)); +} + +static void +dmu_objset_find_dp_cb(void *arg) +{ + dmu_objset_find_ctx_t *dcp = arg; + dsl_pool_t *dp = dcp->dc_dp; + + /* + * We need to get a pool_config_lock here, as there are several + * asssert(pool_config_held) down the stack. Getting a lock via + * dsl_pool_config_enter is risky, as it might be stalled by a + * pending writer. This would deadlock, as the write lock can + * only be granted when our parent thread gives up the lock. + * The _prio interface gives us priority over a pending writer. + */ + dsl_pool_config_enter_prio(dp, FTAG); + + dmu_objset_find_dp_impl(dcp); + + dsl_pool_config_exit(dp, FTAG); } /* - * Find all objsets under name, and for each, call 'func(child_name, arg)'. - * Perhaps change all callers to use dmu_objset_find_spa()? + * Find objsets under and including ddobj, call func(ds) on each. + * The order for the enumeration is completely undefined. + * func is called with dsl_pool_config held. */ int -dmu_objset_find(char *name, int func(const char *, void *), void *arg, - int flags) +dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, + int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { - struct findarg fa; - fa.func = func; - fa.arg = arg; - return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags)); + int error = 0; + taskq_t *tq = NULL; + int ntasks; + dmu_objset_find_ctx_t *dcp; + kmutex_t err_lock; + + mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); + dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); + dcp->dc_tq = NULL; + dcp->dc_dp = dp; + dcp->dc_ddobj = ddobj; + dcp->dc_func = func; + dcp->dc_arg = arg; + dcp->dc_flags = flags; + dcp->dc_error_lock = &err_lock; + dcp->dc_error = &error; + + if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { + /* + * In case a write lock is held we can't make use of + * parallelism, as down the stack of the worker threads + * the lock is asserted via dsl_pool_config_held. + * In case of a read lock this is solved by getting a read + * lock in each worker thread, which isn't possible in case + * of a writer lock. So we fall back to the synchronous path + * here. + * In the future it might be possible to get some magic into + * dsl_pool_config_held in a way that it returns true for + * the worker threads so that a single lock held from this + * thread suffices. For now, stay single threaded. + */ + dmu_objset_find_dp_impl(dcp); + mutex_destroy(&err_lock); + + return (error); + } + + ntasks = dmu_find_threads; + if (ntasks == 0) + ntasks = vdev_count_leaves(dp->dp_spa) * 4; + tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks, + INT_MAX, 0); + if (tq == NULL) { + kmem_free(dcp, sizeof (*dcp)); + mutex_destroy(&err_lock); + + return (SET_ERROR(ENOMEM)); + } + dcp->dc_tq = tq; + + /* dcp will be freed by task */ + (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); + + /* + * PORTING: this code relies on the property of taskq_wait to wait + * until no more tasks are queued and no more tasks are active. 
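dmu_objset_find_dp() fans the child directories out to a taskq and funnels failures through dc_error_lock, keeping only the first error seen. A self-contained user-space analogue of that error funnel, with pthreads standing in for the kernel taskq (illustrative, not part of the patch):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t err_lock = PTHREAD_MUTEX_INITIALIZER;
static int first_error;

static void
record_error(int err)
{
	pthread_mutex_lock(&err_lock);
	if (first_error == 0)		/* only keep the first error */
		first_error = err;
	pthread_mutex_unlock(&err_lock);
}

static void *
worker(void *arg)
{
	int err = *(int *)arg;

	if (err != 0)
		record_error(err);
	return (NULL);
}

int
main(void)
{
	pthread_t t[3];
	int errs[3] = { 0, 5, 2 };	/* pretend EIO and ENOENT failures */

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, &errs[i]);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	printf("first error recorded: %d\n", first_error);
	return (0);
}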
As + * we always queue new tasks from within other tasks, task_wait + * reliably waits for the full recursion to finish, even though we + * enqueue new tasks after taskq_wait has been called. + * On platforms other than illumos, taskq_wait may not have this + * property. + */ + taskq_wait(tq); + taskq_destroy(tq); + mutex_destroy(&err_lock); + + return (error); } /* - * Find all objsets under name, call func on each + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * The dp_config_rwlock must not be held when this is called, and it + * will not be held when the callback is called. + * Therefore this function should only be used when the pool is not changing + * (e.g. in syncing context), or the callback can deal with the possible races. */ -int -dmu_objset_find_spa(spa_t *spa, const char *name, - int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags) +static int +dmu_objset_find_impl(spa_t *spa, const char *name, + int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; - dsl_pool_t *dp; + dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; @@ -1371,43 +1931,50 @@ dmu_objset_find_spa(spa_t *spa, const ch uint64_t thisobj; int err; - if (name == NULL) - name = spa_name(spa); - err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL); - if (err) + dsl_pool_config_enter(dp, FTAG); + + err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); return (err); + } /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); return (0); } - thisobj = dd->dd_phys->dd_head_dataset_obj; + thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - dp = dd->dd_pool; /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj); + dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); - err = dmu_objset_find_spa(spa, child, func, arg, flags); + dsl_pool_config_exit(dp, FTAG); + err = dmu_objset_find_impl(spa, child, + func, arg, flags); + dsl_pool_config_enter(dp, FTAG); strfree(child); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); - if (err) { - dsl_dir_close(dd, FTAG); + if (err != 0) { + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } @@ -1417,77 +1984,61 @@ dmu_objset_find_spa(spa_t *spa, const ch * Iterate over all snapshots. 
*/ if (flags & DS_FIND_SNAPSHOTS) { - if (!dsl_pool_sync_context(dp)) - rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); - if (!dsl_pool_sync_context(dp)) - rw_exit(&dp->dp_config_rwlock); if (err == 0) { - uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + uint64_t snapobj; + + snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { - ASSERT(attr->za_integer_length == + ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); - ASSERT(attr->za_num_integers == 1); + ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); - err = func(spa, attr->za_first_integer, - child, arg); + dsl_pool_config_exit(dp, FTAG); + err = func(child, arg); + dsl_pool_config_enter(dp, FTAG); strfree(child); - if (err) + if (err != 0) break; } zap_cursor_fini(&zc); } } - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); + dsl_pool_config_exit(dp, FTAG); - if (err) + if (err != 0) return (err); - /* - * Apply to self if appropriate. - */ - err = func(spa, thisobj, name, arg); - return (err); + /* Apply to self. */ + return (func(name, arg)); } -/* ARGSUSED */ +/* + * See comment above dmu_objset_find_impl(). + */ int -dmu_objset_prefetch(const char *name, void *arg) +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) { - dsl_dataset_t *ds; - - if (dsl_dataset_hold(name, FTAG, &ds)) - return (0); - - if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { - mutex_enter(&ds->ds_opening_lock); - if (ds->ds_objset == NULL) { - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; - - SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, - ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - - (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), - &ds->ds_phys->ds_bp, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &zb); - } - mutex_exit(&ds->ds_opening_lock); - } + spa_t *spa; + int error; - dsl_dataset_rele(ds, FTAG); - return (0); + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + error = dmu_objset_find_impl(spa, name, func, arg, flags); + spa_close(spa, FTAG); + return (error); } void @@ -1503,3 +2054,19 @@ dmu_objset_get_user(objset_t *os) ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } + +/* + * Determine name of filesystem, given name of snapshot. + * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes + */ +int +dmu_fsname(const char *snapname, char *buf) +{ + char *atp = strchr(snapname, '@'); + if (atp == NULL) + return (SET_ERROR(EINVAL)); + if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strlcpy(buf, snapname, atp - snapname + 1); + return (0); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dmu_send.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c 27 Feb 2010 22:30:41 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c 16 Jun 2017 21:23:39 -0000 @@ -19,8 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
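Back in dmu_objset.c, the new dmu_fsname() helper added at the end of that file strips the snapshot component from a full snapshot name. A small usage sketch with an illustrative name:

static int
example_fsname(void)
{
	char fsname[ZFS_MAX_DATASET_NAME_LEN];

	/* "tank/home@monday" -> "tank/home"; EINVAL if there is no '@'. */
	return (dmu_fsname("tank/home@monday", fsname));
}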
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2012, Martin Matuska . All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. + * Copyright 2016 RackTop Systems. + * Copyright (c) 2014 Integros [integros.com] */ #include @@ -39,53 +45,208 @@ #include #include #include +#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#undef dump_write +#define dump_write dmu_dump_write +#endif + +#ifdef __NetBSD__ +#ifdef _KERNEL +#define FOF_OFFSET FOF_UPDATE_OFFSET +#define td_ucred l_cred +#define bwillwrite() /* nothing */ + +static int +fo_write(struct file *fp, struct uio *uio, cred_t *cred, int flags, kthread_t *thr) +{ + + return (*fp->f_ops->fo_write)(fp, &fp->f_offset, uio, cred, flags); +} + +static int +fo_read(struct file *fp, struct uio *uio, cred_t *cred, int flags, kthread_t *thr) +{ + + return (*fp->f_ops->fo_read)(fp, &fp->f_offset, uio, cred, flags); +} +#endif +#endif + +/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ +int zfs_send_corrupt_data = B_FALSE; +int zfs_send_queue_length = 16 * 1024 * 1024; +int zfs_recv_queue_length = 16 * 1024 * 1024; +/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ +int zfs_send_set_freerecords_bit = B_TRUE; + +#ifdef _KERNEL +TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit); +#endif static char *dmu_recv_tag = "dmu_recv_tag"; +const char *recv_clone_name = "%recv"; -/* - * The list of data whose inclusion in a send stream can be pending from - * one call to backup_cb to another. Multiple calls to dump_free() and - * dump_freeobjects() can be aggregated into a single DRR_FREE or - * DRR_FREEOBJECTS replay record. - */ -typedef enum { - PENDING_NONE, - PENDING_FREE, - PENDING_FREEOBJECTS -} pendop_t; +#define BP_SPAN(datablkszsec, indblkshift, level) \ + (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (indblkshift - SPA_BLKPTRSHIFT))) + +static void byteswap_record(dmu_replay_record_t *drr); + +struct send_thread_arg { + bqueue_t q; + dsl_dataset_t *ds; /* Dataset to traverse */ + uint64_t fromtxg; /* Traverse from this txg */ + int flags; /* flags to pass to traverse_dataset */ + int error_code; + boolean_t cancel; + zbookmark_phys_t resume; +}; -struct backuparg { - dmu_replay_record_t *drr; - vnode_t *vp; - offset_t *off; - objset_t *os; - zio_cksum_t zc; - uint64_t toguid; - int err; - pendop_t pending_op; +struct send_block_record { + boolean_t eos_marker; /* Marks the end of the stream */ + blkptr_t bp; + zbookmark_phys_t zb; + uint8_t indblkshift; + uint16_t datablkszsec; + bqueue_node_t ln; }; static int -dump_bytes(struct backuparg *ba, void *buf, int len) +dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { - ssize_t resid; /* have to get resid to get detailed errno */ - ASSERT3U(len % 8, ==, 0); + dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); + struct uio auio; + struct iovec aiov; + + /* + * The code does not rely on this (len being a multiple of 8). We keep + * this assertion because of the corresponding assertion in + * receive_read(). Keeping this assertion ensures that we do not + * inadvertently break backwards compatibility (causing the assertion + * in receive_read() to trigger on old software). 
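The reworked BP_SPAN() macro defined earlier in this hunk takes the block geometry explicitly instead of a dnode_phys_t. A worked evaluation with illustrative values, 128K data blocks (datablkszsec = 256) and 128K indirect blocks (indblkshift = 17), so each indirection level widens the span by a factor of 1024:

	BP_SPAN(256, 17, level) = 256 << (9 + level * (17 - 7))

	    level 0:  256 << 9   = 128K   (one data block)
	    level 1:  256 << 19  = 128M   (data under one L1 indirect block)
	    level 2:  256 << 29  = 128G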
+ * + * Removing the assertions could be rolled into a new feature that uses + * data that isn't 8-byte aligned; if the assertions were removed, a + * feature flag would have to be added. + */ + + ASSERT0(len % 8); + + aiov.iov_base = buf; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; +#ifdef __NetBSD__ +#ifdef _KERNEL + auio.uio_vmspace = vmspace_kernel(); +#endif +#else + auio.uio_segflg = UIO_SYSSPACE; +#endif + auio.uio_rw = UIO_WRITE; + auio.uio_offset = (off_t)-1; +#ifdef __FreeBSD__ + auio.uio_td = dsp->dsa_td; +#endif +#ifdef _KERNEL + if (dsp->dsa_fp->f_type == DTYPE_VNODE) + bwillwrite(); + dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, + dsp->dsa_td); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + dsp->dsa_err = EOPNOTSUPP; +#endif + mutex_enter(&ds->ds_sendstream_lock); + *dsp->dsa_off += len; + mutex_exit(&ds->ds_sendstream_lock); + + return (dsp->dsa_err); +} - fletcher_4_incremental_native(buf, len, &ba->zc); - ba->err = vn_rdwr(UIO_WRITE, ba->vp, - (caddr_t)buf, len, - 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); - *ba->off += len; - return (ba->err); +/* + * For all record types except BEGIN, fill in the checksum (overlaid in + * drr_u.drr_checksum.drr_checksum). The checksum verifies everything + * up to the start of the checksum itself. + */ +static int +dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) +{ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(dsp->dsa_drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + &dsp->dsa_zc); + if (dsp->dsa_drr->drr_type == DRR_BEGIN) { + dsp->dsa_sent_begin = B_TRUE; + } else { + ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. + drr_checksum.drr_checksum)); + dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; + } + if (dsp->dsa_drr->drr_type == DRR_END) { + dsp->dsa_sent_end = B_TRUE; + } + fletcher_4_incremental_native(&dsp->dsa_drr-> + drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), &dsp->dsa_zc); + if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) + return (SET_ERROR(EINTR)); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, + &dsp->dsa_zc); + if (dump_bytes(dsp, payload, payload_len) != 0) + return (SET_ERROR(EINTR)); + } + return (0); } +/* + * Fill in the drr_free struct, or perform aggregation if the previous record is + * also a free record, and the two are adjacent. + * + * Note that we send free records even for a full send, because we want to be + * able to receive a full send as a clone, which requires a list of all the free + * and freeobject records that were generated on the source. + */ static int -dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, +dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) { - struct drr_free *drrf = &(ba->drr->drr_u.drr_free); + struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + + /* + * When we receive a free record, dbuf_free_range() assumes + * that the receiving system doesn't have any dbufs in the range + * being freed. This is always true because there is a one-record + * constraint: we only send one WRITE record for any given + * object,offset. We know that the one-record constraint is + * true because we always send data in increasing order by + * object,offset. 
+ * + * If the increasing-order constraint ever changes, we should find + * another way to assert that the one-record constraint is still + * satisfied. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + + if (length != -1ULL && offset + length < offset) + length = -1ULL; /* * If there is a pending op, but it's not PENDING_FREE, push it out, @@ -94,13 +255,14 @@ dump_free(struct backuparg *ba, uint64_t * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. */ - if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) - return (EINTR); - ba->pending_op = PENDING_NONE; + if (dsp->dsa_pending_op != PENDING_NONE && + dsp->dsa_pending_op != PENDING_FREE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; } - if (ba->pending_op == PENDING_FREE) { + if (dsp->dsa_pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is -1 * (because dump_dnode is the only place where this @@ -118,35 +280,43 @@ dump_free(struct backuparg *ba, uint64_t return (0); } else { /* not a continuation. Push out pending record */ - if (dump_bytes(ba, ba->drr, - sizeof (dmu_replay_record_t)) != 0) - return (EINTR); - ba->pending_op = PENDING_NONE; + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREE; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; drrf->drr_length = length; - drrf->drr_toguid = ba->toguid; + drrf->drr_toguid = dsp->dsa_toguid; if (length == -1ULL) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) - return (EINTR); + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); } else { - ba->pending_op = PENDING_FREE; + dsp->dsa_pending_op = PENDING_FREE; } return (0); } static int -dump_data(struct backuparg *ba, dmu_object_type_t type, +dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { - struct drr_write *drrw = &(ba->drr->drr_u.drr_write); + struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); + /* + * We send data in increasing object, offset order. + * See comment in dump_free() for details. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + dsp->dsa_last_data_object = object; + dsp->dsa_last_data_offset = offset + blksz - 1; /* * If there is any kind of pending aggregation (currently either @@ -154,38 +324,104 @@ dump_data(struct backuparg *ba, dmu_obje * the stream, since aggregation can't be done across operations * of different types. 
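The aggregation rule that dump_free() applies is easiest to see in isolation: a free that starts exactly where the pending record ends just extends it, anything else pushes the pending record out first. A user-space sketch of that rule (object numbers omitted for brevity; the real code also requires the object to match):

#include <stdint.h>
#include <stdio.h>

static uint64_t pend_off, pend_len;
static int pend_valid;

static void
flush_pending(void)			/* cf. pushing out the pending record */
{
	if (pend_valid)
		printf("FREE off=%ju len=%ju\n",
		    (uintmax_t)pend_off, (uintmax_t)pend_len);
	pend_valid = 0;
}

static void
toy_dump_free(uint64_t offset, uint64_t length)
{
	if (pend_valid && offset == pend_off + pend_len) {
		pend_len += length;	/* continuation: aggregate */
		return;
	}
	flush_pending();		/* not adjacent: emit pending record */
	pend_off = offset;
	pend_len = length;
	pend_valid = 1;
}

int
main(void)
{
	toy_dump_free(0, 4096);
	toy_dump_free(4096, 4096);	/* merged with the previous free */
	toy_dump_free(65536, 4096);	/* gap: the merged record is emitted */
	flush_pending();
	return (0);
}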
*/ - if (ba->pending_op != PENDING_NONE) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + /* write a WRITE record */ + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_WRITE; + drrw->drr_object = object; + drrw->drr_type = type; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = dsp->dsa_toguid; + if (bp == NULL || BP_IS_EMBEDDED(bp)) { + /* + * There's no pre-computed checksum for partial-block + * writes or embedded BP's, so (like + * fletcher4-checkummed blocks) userland will have to + * compute a dedup-capable checksum itself. + */ + drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; + } else { + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & + ZCHECKSUM_FLAG_DEDUP) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + } + + if (dump_record(dsp, data, blksz) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static int +dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, + int blksz, const blkptr_t *bp) +{ + char buf[BPE_PAYLOAD_SIZE]; + struct drr_write_embedded *drrw = + &(dsp->dsa_drr->drr_u.drr_write_embedded); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) return (EINTR); - ba->pending_op = PENDING_NONE; + dsp->dsa_pending_op = PENDING_NONE; } - /* write a DATA record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_WRITE; + + ASSERT(BP_IS_EMBEDDED(bp)); + + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; - drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_length = blksz; - drrw->drr_toguid = ba->toguid; - drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); - if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) - drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; - DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); - DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); - DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); - drrw->drr_key.ddk_cksum = bp->blk_cksum; + drrw->drr_toguid = dsp->dsa_toguid; + drrw->drr_compression = BP_GET_COMPRESS(bp); + drrw->drr_etype = BPE_GET_ETYPE(bp); + drrw->drr_lsize = BPE_GET_LSIZE(bp); + drrw->drr_psize = BPE_GET_PSIZE(bp); - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) - return (EINTR); - if (dump_bytes(ba, data, blksz) != 0) + decode_embedded_bp_compressed(bp, buf); + + if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (EINTR); return (0); } static int -dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) +dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) +{ + struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + + /* write a SPILL record */ + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_SPILL; + drrs->drr_object = object; + drrs->drr_length = blksz; + drrs->drr_toguid = dsp->dsa_toguid; + + if (dump_record(dsp, data, 
blksz) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static int +dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { - struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); + struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, @@ -194,13 +430,13 @@ dump_freeobjects(struct backuparg *ba, u * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ - if (ba->pending_op != PENDING_NONE && - ba->pending_op != PENDING_FREEOBJECTS) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) - return (EINTR); - ba->pending_op = PENDING_NONE; + if (dsp->dsa_pending_op != PENDING_NONE && + dsp->dsa_pending_op != PENDING_FREEOBJECTS) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; } - if (ba->pending_op == PENDING_FREEOBJECTS) { + if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one @@ -210,42 +446,54 @@ dump_freeobjects(struct backuparg *ba, u return (0); } else { /* can't be aggregated. Push out pending record */ - if (dump_bytes(ba, ba->drr, - sizeof (dmu_replay_record_t)) != 0) - return (EINTR); - ba->pending_op = PENDING_NONE; + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_FREEOBJECTS; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; - drrfo->drr_toguid = ba->toguid; + drrfo->drr_toguid = dsp->dsa_toguid; - ba->pending_op = PENDING_FREEOBJECTS; + dsp->dsa_pending_op = PENDING_FREEOBJECTS; return (0); } static int -dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) +dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { - struct drr_object *drro = &(ba->drr->drr_u.drr_object); + struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + + if (object < dsp->dsa_resume_object) { + /* + * Note: when resuming, we will visit all the dnodes in + * the block of dnodes that we are resuming from. In + * this case it's unnecessary to send the dnodes prior to + * the one we are resuming from. We should be at most one + * block's worth of dnodes behind the resume point. 
+ */ + ASSERT3U(dsp->dsa_resume_object - object, <, + 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); + return (0); + } if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) - return (dump_freeobjects(ba, object, 1)); + return (dump_freeobjects(dsp, object, 1)); - if (ba->pending_op != PENDING_NONE) { - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) - return (EINTR); - ba->pending_op = PENDING_NONE; + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; } /* write an OBJECT record */ - bzero(ba->drr, sizeof (dmu_replay_record_t)); - ba->drr->drr_type = DRR_OBJECT; + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; @@ -253,488 +501,1317 @@ dump_dnode(struct backuparg *ba, uint64_ drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; - drro->drr_toguid = ba->toguid; + drro->drr_toguid = dsp->dsa_toguid; - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) - return (EINTR); + if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) + drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; + + if (dump_record(dsp, DN_BONUS(dnp), + P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { + return (SET_ERROR(EINTR)); + } + + /* Free anything past the end of the file. */ + if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) + return (SET_ERROR(EINTR)); + if (dsp->dsa_err != 0) + return (SET_ERROR(EINTR)); + return (0); +} - if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) - return (EINTR); +static boolean_t +backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) +{ + if (!BP_IS_EMBEDDED(bp)) + return (B_FALSE); - /* free anything past the end of the file */ - if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) - return (EINTR); - if (ba->err) - return (EINTR); - return (0); + /* + * Compression function must be legacy, or explicitly enabled. + */ + if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && + !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) + return (B_FALSE); + + /* + * Embed type must be explicitly enabled. + */ + switch (BPE_GET_ETYPE(bp)) { + case BP_EMBEDDED_TYPE_DATA: + if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (B_TRUE); + break; + default: + return (B_FALSE); + } + return (B_FALSE); +} + +/* + * This is the callback function to traverse_dataset that acts as the worker + * thread for dmu_send_impl. 
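The comment above introduces the producer side of the send pipeline: send_cb() turns each visited block pointer into a send_block_record and enqueues it. The consumer side, in dmu_send_impl() later in this file, drains the queue until it sees the end-of-stream marker pushed by send_traverse_thread(). A condensed sketch of that consumption loop, using the helpers added in this hunk (variable names and cleanup are approximate):

	struct send_block_record *to_data;

	to_data = bqueue_dequeue(&to_arg.q);	/* first record from send_cb() */
	while (err == 0 && !to_data->eos_marker) {
		err = do_dump(dsp, to_data);	/* emits OBJECT/WRITE/FREE/... */
		to_data = get_next_record(&to_arg.q, to_data);
	}
	/* remaining records are drained and freed during error cleanup */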
+ */ +/*ARGSUSED*/ +static int +send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) +{ + struct send_thread_arg *sta = arg; + struct send_block_record *record; + uint64_t record_size; + int err = 0; + + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + + if (sta->cancel) + return (SET_ERROR(EINTR)); + + if (bp == NULL) { + ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); + return (0); + } else if (zb->zb_level < 0) { + return (0); + } + + record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP); + record->eos_marker = B_FALSE; + record->bp = *bp; + record->zb = *zb; + record->indblkshift = dnp->dn_indblkshift; + record->datablkszsec = dnp->dn_datablkszsec; + record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + bqueue_enqueue(&sta->q, record, record_size); + + return (err); } -#define BP_SPAN(dnp, level) \ - (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ - (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +/* + * This function kicks off the traverse_dataset. It also handles setting the + * error code of the thread in case something goes wrong, and pushes the End of + * Stream record when the traverse_dataset call has finished. If there is no + * dataset to traverse, the thread immediately pushes End of Stream marker. + */ +static void +send_traverse_thread(void *arg) +{ + struct send_thread_arg *st_arg = arg; + int err; + struct send_block_record *data; + + if (st_arg->ds != NULL) { + err = traverse_dataset_resume(st_arg->ds, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + + if (err != EINTR) + st_arg->error_code = err; + } + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + data->eos_marker = B_TRUE; + bqueue_enqueue(&st_arg->q, data, 1); + thread_exit(); +} -/* ARGSUSED */ +/* + * This function actually handles figuring out what kind of record needs to be + * dumped, reading the data (which has hopefully been prefetched), and calling + * the appropriate helper function. + */ static int -backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) { - struct backuparg *ba = arg; + dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os); + const blkptr_t *bp = &data->bp; + const zbookmark_phys_t *zb = &data->zb; + uint8_t indblkshift = data->indblkshift; + uint16_t dblkszsec = data->datablkszsec; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; - if (issig(JUSTLOOKING) && issig(FORREAL)) - return (EINTR); + ASSERT3U(zb->zb_level, >=, 0); + + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= dsa->dsa_resume_object); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); - } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); + } else if (BP_IS_HOLE(bp) && + zb->zb_object == DMU_META_DNODE_OBJECT) { + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; - err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); - } else if (bp == NULL) { - uint64_t span = BP_SPAN(dnp, zb->zb_level); - err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); + err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT); + } else if (BP_IS_HOLE(bp)) { + uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); + uint64_t offset = zb->zb_blkid * span; + err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { - dnode_phys_t *blk; - int i; int blksz = BP_GET_LSIZE(bp); - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; - if (arc_read_nolock(NULL, spa, bp, - arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) - return (EIO); - - blk = abuf->b_data; - for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = (zb->zb_blkid << - (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; - err = dump_dnode(ba, dnobj, blk+i); - if (err) + ASSERT0(zb->zb_level); + + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) + return (SET_ERROR(EIO)); + + dnode_phys_t *blk = abuf->b_data; + uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT); + for (int i = 0; i < blksz >> DNODE_SHIFT; i++) { + err = dump_dnode(dsa, dnobj + i, blk + i); + if (err != 0) break; } - (void) arc_buf_remove_ref(abuf, &abuf); - } else { /* it's a level-0 block of a regular object */ - uint32_t aflags = ARC_WAIT; + arc_buf_destroy(abuf, &abuf); + } else if (type == DMU_OT_SA) { + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); - if (arc_read_nolock(NULL, spa, bp, - arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL, &aflags, zb) != 0) - return (EIO); - - err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, bp, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) + return (SET_ERROR(EIO)); + + err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); + arc_buf_destroy(abuf, &abuf); + } else if (backup_do_embed(dsa, bp)) { + /* it's an embedded level-0 block of a regular object */ + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + ASSERT0(zb->zb_level); + err = dump_write_embedded(dsa, zb->zb_object, + zb->zb_blkid * blksz, blksz, bp); + } else { + /* it's a level-0 block of a regular object */ + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf; + int blksz = dblkszsec << SPA_MINBLOCKSHIFT; + uint64_t offset; + + ASSERT0(zb->zb_level); + ASSERT(zb->zb_object > dsa->dsa_resume_object || + (zb->zb_object == dsa->dsa_resume_object && + zb->zb_blkid * blksz >= dsa->dsa_resume_offset)); + + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + 
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &aflags, zb) != 0) { + if (zfs_send_corrupt_data) { + /* Send a block filled with 0x"zfs badd bloc" */ + abuf = arc_alloc_buf(spa, blksz, &abuf, + ARC_BUFC_DATA); + uint64_t *ptr; + for (ptr = abuf->b_data; + (char *)ptr < (char *)abuf->b_data + blksz; + ptr++) + *ptr = 0x2f5baddb10cULL; + } else { + return (SET_ERROR(EIO)); + } + } + + offset = zb->zb_blkid * blksz; + + if (!(dsa->dsa_featureflags & + DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + blksz > SPA_OLD_MAXBLOCKSIZE) { + char *buf = abuf->b_data; + while (blksz > 0 && err == 0) { + int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); + err = dump_write(dsa, type, zb->zb_object, + offset, n, NULL, buf); + offset += n; + buf += n; + blksz -= n; + } + } else { + err = dump_write(dsa, type, zb->zb_object, + offset, blksz, bp, abuf->b_data); + } + arc_buf_destroy(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); return (err); } -int -dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, +/* + * Pop the new data off the queue, and free the old data. + */ +static struct send_block_record * +get_next_record(bqueue_t *bq, struct send_block_record *data) +{ + struct send_block_record *tmp = bqueue_dequeue(bq); + kmem_free(data, sizeof (*data)); + return (tmp); +} + +/* + * Actually do the bulk of the work in a zfs send. + * + * Note: Releases dp using the specified tag. + */ +static int +dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, + zfs_bookmark_phys_t *ancestor_zb, + boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, +#ifdef illumos vnode_t *vp, offset_t *off) +#else + struct file *fp, offset_t *off) +#endif { - dsl_dataset_t *ds = tosnap->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? 
fromsnap->os_dsl_dataset : NULL; + objset_t *os; dmu_replay_record_t *drr; - struct backuparg ba; + dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; + uint64_t featureflags = 0; + struct send_thread_arg to_arg = { 0 }; - /* tosnap must be a snapshot */ - if (ds->ds_phys->ds_next_snap_obj == 0) - return (EINVAL); - - /* fromsnap must be an earlier snapshot from the same fs as tosnap */ - if (fromds && (ds->ds_dir != fromds->ds_dir || - fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) - return (EXDEV); - - if (fromorigin) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (fromsnap) - return (EINVAL); - - if (dsl_dir_is_clone(ds->ds_dir)) { - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); - rw_exit(&dp->dp_config_rwlock); - if (err) - return (err); - } else { - fromorigin = B_FALSE; - } + err = dmu_objset_from_ds(to_ds, &os); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); } - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, DMU_SUBSTREAM); + +#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { + uint64_t version; + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { + kmem_free(drr, sizeof (dmu_replay_record_t)); + dsl_pool_rele(dp, tag); + return (SET_ERROR(EINVAL)); + } + if (version >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + } +#endif + + if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) + featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; + if (embedok && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { + featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; + } + + if (resumeobj != 0 || resumeoff != 0) { + featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + + DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, + featureflags); + drr->drr_u.drr_begin.drr_creation_time = - ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; - if (fromorigin) + dsl_dataset_phys(to_ds)->ds_creation_time; + drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); + if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; - drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; + if (zfs_send_set_freerecords_bit) + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; - if (fromds) - drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; - dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - - if (fromds) - fromtxg = fromds->ds_phys->ds_creation_txg; - if (fromorigin) - dsl_dataset_rele(fromds, FTAG); + if (ancestor_zb != NULL) { + drr->drr_u.drr_begin.drr_fromguid = + ancestor_zb->zbm_guid; + fromtxg = ancestor_zb->zbm_creation_txg; + } + dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname); + if (!to_ds->ds_is_snapshot) { + (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", + sizeof (drr->drr_u.drr_begin.drr_toname)); + } + + dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); + + dsp->dsa_drr = drr; + dsp->dsa_outfd = outfd; + dsp->dsa_proc = 
curproc; + dsp->dsa_td = curthread; + dsp->dsa_fp = fp; + dsp->dsa_os = os; + dsp->dsa_off = off; + dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; + dsp->dsa_pending_op = PENDING_NONE; + dsp->dsa_featureflags = featureflags; + dsp->dsa_resume_object = resumeobj; + dsp->dsa_resume_offset = resumeoff; + + mutex_enter(&to_ds->ds_sendstream_lock); + list_insert_head(&to_ds->ds_sendstreams, dsp); + mutex_exit(&to_ds->ds_sendstream_lock); + + dsl_dataset_long_hold(to_ds, FTAG); + dsl_pool_rele(dp, tag); + + void *payload = NULL; + size_t payload_len = 0; + if (resumeobj != 0 || resumeoff != 0) { + dmu_object_info_t to_doi; + err = dmu_object_info(os, resumeobj, &to_doi); + if (err != 0) + goto out; + SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, + resumeoff / to_doi.doi_data_block_size); - ba.drr = drr; - ba.vp = vp; - ba.os = tosnap; - ba.off = off; - ba.toguid = ds->ds_phys->ds_guid; - ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); - ba.pending_op = PENDING_NONE; - - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); + nvlist_t *nvl = fnvlist_alloc(); + fnvlist_add_uint64(nvl, "resume_object", resumeobj); + fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + fnvlist_free(nvl); + } + + err = dump_record(dsp, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { + err = dsp->dsa_err; + goto out; + } + + err = bqueue_init(&to_arg.q, zfs_send_queue_length, + offsetof(struct send_block_record, ln)); + to_arg.error_code = 0; + to_arg.cancel = B_FALSE; + to_arg.ds = to_ds; + to_arg.fromtxg = fromtxg; + to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; + (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0, + TS_RUN, minclsyspri); + + struct send_block_record *to_data; + to_data = bqueue_dequeue(&to_arg.q); + + while (!to_data->eos_marker && err == 0) { + err = do_dump(dsp, to_data); + to_data = get_next_record(&to_arg.q, to_data); + if (issig(JUSTLOOKING) && issig(FORREAL)) + err = EINTR; } - err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, - backup_cb, &ba); + if (err != 0) { + to_arg.cancel = B_TRUE; + while (!to_data->eos_marker) { + to_data = get_next_record(&to_arg.q, to_data); + } + } + kmem_free(to_data, sizeof (*to_data)); - if (ba.pending_op != PENDING_NONE) - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) - err = EINTR; + bqueue_destroy(&to_arg.q); - if (err) { - if (err == EINTR && ba.err) - err = ba.err; - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (err); + if (err == 0 && to_arg.error_code != 0) + err = to_arg.error_code; + + if (err != 0) + goto out; + + if (dsp->dsa_pending_op != PENDING_NONE) + if (dump_record(dsp, NULL, 0) != 0) + err = SET_ERROR(EINTR); + + if (err != 0) { + if (err == EINTR && dsp->dsa_err != 0) + err = dsp->dsa_err; + goto out; } bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; - drr->drr_u.drr_end.drr_checksum = ba.zc; - drr->drr_u.drr_end.drr_toguid = ba.toguid; + drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; + drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { - kmem_free(drr, sizeof (dmu_replay_record_t)); - return (ba.err); - } + if (dump_record(dsp, NULL, 0) != 0) + err = dsp->dsa_err; + +out: + mutex_enter(&to_ds->ds_sendstream_lock); + list_remove(&to_ds->ds_sendstreams, dsp); + 
mutex_exit(&to_ds->ds_sendstream_lock); + + VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end)); kmem_free(drr, sizeof (dmu_replay_record_t)); + kmem_free(dsp, sizeof (dmu_sendarg_t)); - return (0); -} + dsl_dataset_long_rele(to_ds, FTAG); -struct recvbeginsyncarg { - const char *tofs; - const char *tosnap; - dsl_dataset_t *origin; - uint64_t fromguid; - dmu_objset_type_t type; - void *tag; - boolean_t force; - uint64_t dsflags; - char clonelastname[MAXNAMELEN]; - dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ -}; + return (err); +} -/* ARGSUSED */ -static int -recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) +int +dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + boolean_t embedok, boolean_t large_block_ok, +#ifdef illumos + int outfd, vnode_t *vp, offset_t *off) +#else + int outfd, struct file *fp, offset_t *off) +#endif { - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val; + dsl_pool_t *dp; + dsl_dataset_t *ds; + dsl_dataset_t *fromds = NULL; int err; - err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, - strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); - - if (err != ENOENT) - return (err ? err : EEXIST); + err = dsl_pool_hold(pool, FTAG, &dp); + if (err != 0) + return (err); - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) - return (EXDEV); - if (!dsl_dataset_is_snapshot(rbsa->origin)) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); + err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); } - return (0); -} - -static void -recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dir_t *dd = arg1; - struct recvbeginsyncarg *rbsa = arg2; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; - uint64_t dsobj; + if (fromsnap != 0) { + zfs_bookmark_phys_t zb; + boolean_t is_clone; - /* Create and open new dataset. 
*/ - dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, - rbsa->origin, flags, cr, tx); - VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, - B_TRUE, dmu_recv_tag, &rbsa->ds)); - - if (rbsa->origin == NULL) { - (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, - rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); + err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + if (!dsl_dataset_is_before(ds, fromds, 0)) + err = SET_ERROR(EXDEV); + zb.zbm_creation_time = + dsl_dataset_phys(fromds)->ds_creation_time; + zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; + zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + is_clone = (fromds->ds_dir != ds->ds_dir); + dsl_dataset_rele(fromds, FTAG); + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + embedok, large_block_ok, outfd, 0, 0, fp, off); + } else { + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + embedok, large_block_ok, outfd, 0, 0, fp, off); } - - spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, - dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj); + dsl_dataset_rele(ds, FTAG); + return (err); } -/* ARGSUSED */ -static int -recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) +int +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff, +#ifdef illumos + vnode_t *vp, offset_t *off) +#else + struct file *fp, offset_t *off) +#endif { - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; + dsl_pool_t *dp; + dsl_dataset_t *ds; int err; - uint64_t val; + boolean_t owned = B_FALSE; + + if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) + return (SET_ERROR(EINVAL)); - /* must not have any changes since most recent snapshot */ - if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) - return (ETXTBSY); - - if (rbsa->fromguid) { - /* if incremental, most recent snapshot must match fromguid */ - if (ds->ds_prev == NULL) - return (ENODEV); + err = dsl_pool_hold(tosnap, FTAG, &dp); + if (err != 0) + return (err); + if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { /* - * most recent snapshot must match fromguid, or there are no - * changes since the fromguid one + * We are sending a filesystem or volume. Ensure + * that it doesn't change by owning the dataset. 
*/ - if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { - uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; - uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; - while (obj != 0) { - dsl_dataset_t *snap; - err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, - obj, FTAG, &snap); - if (err) - return (ENODEV); - if (snap->ds_phys->ds_creation_txg < birth) { - dsl_dataset_rele(snap, FTAG); - return (ENODEV); - } - if (snap->ds_phys->ds_guid == rbsa->fromguid) { - dsl_dataset_rele(snap, FTAG); - break; /* it's ok */ - } - obj = snap->ds_phys->ds_prev_snap_obj; - dsl_dataset_rele(snap, FTAG); - } - if (obj == 0) - return (ENODEV); - } + err = dsl_dataset_own(dp, tosnap, FTAG, &ds); + owned = B_TRUE; } else { - /* if full, most recent snapshot must be $ORIGIN */ - if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) - return (ENODEV); + err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); } - - /* temporary clone name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_dir->dd_phys->dd_child_dir_zapobj, - rbsa->clonelastname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) + if (err != 0) { + dsl_pool_rele(dp, FTAG); return (err); + } - /* new snapshot name must not exist */ - err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - return (0); + if (fromsnap != NULL) { + zfs_bookmark_phys_t zb; + boolean_t is_clone = B_FALSE; + int fsnamelen = strchr(tosnap, '@') - tosnap; + + /* + * If the fromsnap is in a different filesystem, then + * mark the send stream as a clone. + */ + if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || + (fromsnap[fsnamelen] != '@' && + fromsnap[fsnamelen] != '#')) { + is_clone = B_TRUE; + } + + if (strchr(fromsnap, '@')) { + dsl_dataset_t *fromds; + err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); + if (err == 0) { + if (!dsl_dataset_is_before(ds, fromds, 0)) + err = SET_ERROR(EXDEV); + zb.zbm_creation_time = + dsl_dataset_phys(fromds)->ds_creation_time; + zb.zbm_creation_txg = + dsl_dataset_phys(fromds)->ds_creation_txg; + zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + is_clone = (ds->ds_dir != fromds->ds_dir); + dsl_dataset_rele(fromds, FTAG); + } + } else { + err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); + } + if (err != 0) { + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, + embedok, large_block_ok, + outfd, resumeobj, resumeoff, fp, off); + } else { + err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, + embedok, large_block_ok, + outfd, resumeobj, resumeoff, fp, off); + } + if (owned) + dsl_dataset_disown(ds, FTAG); + else + dsl_dataset_rele(ds, FTAG); + return (err); } -/* ARGSUSED */ -static void -recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +static int +dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, + uint64_t *sizep) { - dsl_dataset_t *ohds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_pool_t *dp = ohds->ds_dir->dd_pool; - dsl_dataset_t *cds; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; - uint64_t dsobj; - - /* create and open the temporary clone */ - dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, - ohds->ds_prev, flags, cr, tx); - VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); + int err; + /* + * Assume that space (both on-disk and in-stream) is dominated by + * data. 
We will adjust for indirect blocks and the copies property, + * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). + */ /* - * If we actually created a non-clone, we need to create the - * objset in our new dataset. + * Subtract out approximate space used by indirect blocks. + * Assume most space is used by data blocks (non-indirect, non-dnode). + * Assume all blocks are recordsize. Assume ditto blocks and + * internal fragmentation counter out compression. + * + * Therefore, space used by indirect blocks is sizeof(blkptr_t) per + * block, which we observe in practice. */ - if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { - (void) dmu_objset_create_impl(dp->dp_spa, - cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); - } + uint64_t recordsize; + err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); + if (err != 0) + return (err); + size -= size / recordsize * sizeof (blkptr_t); - rbsa->ds = cds; + /* Add in the space for the record associated with each block. */ + size += size / recordsize * sizeof (dmu_replay_record_t); - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", dsobj); + *sizep = size; + + return (0); } -/* - * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() - * succeeds; otherwise we will leak the holds on the datasets. - */ int -dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, - boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) +dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) { - int err = 0; - boolean_t byteswap; - struct recvbeginsyncarg rbsa = { 0 }; - uint64_t versioninfo; - int flags; - dsl_dataset_t *ds; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + int err; + uint64_t size; - if (drrb->drr_magic == DMU_BACKUP_MAGIC) - byteswap = FALSE; - else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - byteswap = TRUE; - else - return (EINVAL); + ASSERT(dsl_pool_config_held(dp)); - rbsa.tofs = tofs; - rbsa.tosnap = tosnap; - rbsa.origin = origin ? origin->os_dsl_dataset : NULL; - rbsa.fromguid = drrb->drr_fromguid; - rbsa.type = drrb->drr_type; - rbsa.tag = FTAG; - rbsa.dsflags = 0; - versioninfo = drrb->drr_versioninfo; - flags = drrb->drr_flags; - - if (byteswap) { - rbsa.type = BSWAP_32(rbsa.type); - rbsa.fromguid = BSWAP_64(rbsa.fromguid); - versioninfo = BSWAP_64(versioninfo); - flags = BSWAP_32(flags); - } - - if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || - rbsa.type >= DMU_OST_NUMTYPES || - ((flags & DRR_FLAG_CLONE) && origin == NULL)) - return (EINVAL); + /* tosnap must be a snapshot */ + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); - if (flags & DRR_FLAG_CI_DATA) - rbsa.dsflags = DS_FLAG_CI_DATASET; + /* fromsnap, if provided, must be a snapshot */ + if (fromds != NULL && !fromds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); - bzero(drc, sizeof (dmu_recv_cookie_t)); - drc->drc_drrb = drrb; - drc->drc_tosnap = tosnap; - drc->drc_top_ds = top_ds; - drc->drc_force = force; + /* + * fromsnap must be an earlier snapshot from the same fs as tosnap, + * or the origin's fs. + */ + if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) + return (SET_ERROR(EXDEV)); + + /* Get uncompressed size estimate of changed data. 
*/ + if (fromds == NULL) { + size = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + } else { + uint64_t used, comp; + err = dsl_dataset_space_written(fromds, ds, + &used, &comp, &size); + if (err != 0) + return (err); + } + + err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); + return (err); +} + +/* + * Simple callback used to traverse the blocks of a snapshot and sum their + * uncompressed size + */ +/* ARGSUSED */ +static int +dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + uint64_t *spaceptr = arg; + if (bp != NULL && !BP_IS_HOLE(bp)) { + *spaceptr += BP_GET_UCSIZE(bp); + } + return (0); +} + +/* + * Given a desination snapshot and a TXG, calculate the approximate size of a + * send stream sent from that TXG. from_txg may be zero, indicating that the + * whole snapshot will be sent. + */ +int +dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, + uint64_t *sizep) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + int err; + uint64_t size = 0; + + ASSERT(dsl_pool_config_held(dp)); + + /* tosnap must be a snapshot */ + if (!dsl_dataset_is_snapshot(ds)) + return (SET_ERROR(EINVAL)); + + /* verify that from_txg is before the provided snapshot was taken */ + if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { + return (SET_ERROR(EXDEV)); + } /* - * Process the begin in syncing context. + * traverse the blocks of the snapshot with birth times after + * from_txg, summing their uncompressed size */ + err = traverse_dataset(ds, from_txg, TRAVERSE_POST, + dmu_calculate_send_traversal, &size); + if (err) + return (err); - /* open the dataset we are logically receiving into */ - err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); - if (err == 0) { + err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); + return (err); +} + +typedef struct dmu_recv_begin_arg { + const char *drba_origin; + dmu_recv_cookie_t *drba_cookie; + cred_t *drba_cred; + uint64_t drba_snapobj; +} dmu_recv_begin_arg_t; + +static int +recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, + uint64_t fromguid) +{ + uint64_t val; + int error; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* temporary clone name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? EBUSY : error); + + /* new snapshot name must not exist */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, + drba->drba_cookie->drc_tosnap, 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? EEXIST : error); + + /* + * Check snapshot limit before receiving. We'll recheck again at the + * end, but might as well abort before receiving if we're already over + * the limit. + * + * Note that we do not check the file system limit with + * dsl_dir_fscount_check because the temporary %clones don't count + * against that limit. + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, + NULL, drba->drba_cred); + if (error != 0) + return (error); + + if (fromguid != 0) { + dsl_dataset_t *snap; + uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + + /* Find snapshot in this dir that matches fromguid. 
*/ + while (obj != 0) { + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) + return (SET_ERROR(ENODEV)); + if (snap->ds_dir != ds->ds_dir) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENODEV)); + } + if (dsl_dataset_phys(snap)->ds_guid == fromguid) + break; + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (SET_ERROR(ENODEV)); + + if (drba->drba_cookie->drc_force) { + drba->drba_snapobj = obj; + } else { + /* + * If we are not forcing, there must be no + * changes since fromsnap. + */ + if (dsl_dataset_modified_since_snap(ds, snap)) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ETXTBSY)); + } + drba->drba_snapobj = ds->ds_prev->ds_object; + } + + dsl_dataset_rele(snap, FTAG); + } else { + /* if full, then must be forced */ + if (!drba->drba_cookie->drc_force) + return (SET_ERROR(EEXIST)); + /* start from $ORIGIN@$ORIGIN, if supported */ + drba->drba_snapobj = dp->dp_origin_snap != NULL ? + dp->dp_origin_snap->ds_object : 0; + } + + return (0); + +} + +static int +dmu_recv_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + uint64_t fromguid = drrb->drr_fromguid; + int flags = drrb->drr_flags; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plan WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. + */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate large blocks + * to smaller ones, so the pool must have the LARGE_BLOCKS + * feature enabled if the stream has LARGE_BLOCKS. 
+ */ + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ - if (flags & DRR_FLAG_CLONE) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (EINVAL); - } - - /* must not have an incremental recv already in progress */ - if (!mutex_tryenter(&ds->ds_recvlock)) { - dsl_dataset_rele(ds, dmu_recv_tag); - return (EBUSY); - } - - /* tmp clone name is: tofs/%tosnap" */ - (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), - "%%%s", tosnap); - rbsa.force = force; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_existing_check, recv_existing_sync, ds, &rbsa, 5); - if (err) { - mutex_exit(&ds->ds_recvlock); - dsl_dataset_rele(ds, dmu_recv_tag); - return (err); + if (flags & DRR_FLAG_CLONE || drba->drba_origin) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); } - drc->drc_logical_ds = ds; - drc->drc_real_ds = rbsa.ds; - } else if (err == ENOENT) { + + error = recv_begin_check_existing_impl(drba, ds, fromguid); + dsl_dataset_rele(ds, FTAG); + } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ - char *cp; + char buf[ZFS_MAX_DATASET_NAME_LEN]; /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) - return (ENOENT); + if (fromguid != 0 && !(flags & DRR_FLAG_CLONE || + drba->drba_origin)) + return (SET_ERROR(ENOENT)); + + /* + * If we're receiving a full send as a clone, and it doesn't + * contain all the necessary free records and freeobject + * records, reject it. + */ + if (fromguid == 0 && drba->drba_origin && + !(flags & DRR_FLAG_FREERECORDS)) + return (SET_ERROR(EINVAL)); /* Open the parent of tofs */ - cp = strrchr(tofs, '/'); - *cp = '\0'; - err = dsl_dataset_hold(tofs, FTAG, &ds); - *cp = '/'; - if (err) - return (err); + ASSERT3U(strlen(tofs), <, sizeof (buf)); + (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); + error = dsl_dataset_hold(dp, buf, FTAG, &ds); + if (error != 0) + return (error); + + /* + * Check filesystem and snapshot limits before receiving. We'll + * recheck snapshot limits again at the end (we create the + * filesystems and increment those counts during begin_sync). 
+ */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (drba->drba_origin != NULL) { + dsl_dataset_t *origin; + error = dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (!origin->ds_is_snapshot) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + if (dsl_dataset_phys(origin)->ds_guid != fromguid && + fromguid != 0) { + dsl_dataset_rele(origin, FTAG); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENODEV)); + } + dsl_dataset_rele(origin, FTAG); + } dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; - drc->drc_newfs = B_TRUE; + error = 0; } + return (error); +} - return (err); +static void +dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds, *newds; + uint64_t dsobj; + int error; + uint64_t crflags = 0; + + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; + + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error == 0) { + /* create temporary clone */ + dsl_dataset_t *snap = NULL; + if (drba->drba_snapobj != 0) { + VERIFY0(dsl_dataset_hold_obj(dp, + drba->drba_snapobj, FTAG, &snap)); + } + dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, + snap, crflags, drba->drba_cred, tx); + if (drba->drba_snapobj != 0) + dsl_dataset_rele(snap, FTAG); + dsl_dataset_rele(ds, FTAG); + } else { + dsl_dir_t *dd; + const char *tail; + dsl_dataset_t *origin = NULL; + + VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); + + if (drba->drba_origin != NULL) { + VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin)); + } + + /* Create new dataset. 
*/ + dsobj = dsl_dataset_create_sync(dd, + strrchr(tofs, '/') + 1, + origin, crflags, drba->drba_cred, tx); + if (origin != NULL) + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(dd, FTAG); + drba->drba_cookie->drc_newfs = B_TRUE; + } + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + + if (drba->drba_cookie->drc_resumable) { + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + uint64_t one = 1; + uint64_t zero = 0; + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + } + + dmu_buf_will_dirty(newds->ds_dbuf, tx); + dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; + + /* + * If we actually created a non-clone, we need to create the + * objset in our new dataset. + */ + rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); + if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { + (void) dmu_objset_create_impl(dp->dp_spa, + newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); + } + rrw_exit(&newds->ds_bp_rwlock, FTAG); + + drba->drba_cookie->drc_ds = newds; + + spa_history_log_internal_ds(newds, "receive", tx, ""); +} + +static int +dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + int error; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES) + return (SET_ERROR(EINVAL)); + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(dp->dp_spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * The receiving code doesn't know how to translate a WRITE_EMBEDDED + * record to a plain WRITE record, so the pool must have the + * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED + * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 
+ */ + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (error != 0) + return (error); + } + + /* check that ds is marked inconsistent */ + if (!DS_IS_INCONSISTENT(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* check that there is resuming data, and that the toguid matches */ + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + uint64_t val; + error = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); + if (error != 0 || drrb->drr_toguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Check if the receive is still running. If so, it will be owned. + * Note that nothing else can own the dataset (e.g. after the receive + * fails) because it will be marked inconsistent. + */ + if (dsl_dataset_has_owner(ds)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EBUSY)); + } + + /* There should not be any snapshots of this fs yet. */ + if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Note: resume point will be checked when we process the first WRITE + * record. + */ + + /* check that the origin matches */ + val = 0; + (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); + if (drrb->drr_fromguid != val) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + const char *tofs = drba->drba_cookie->drc_tofs; + dsl_dataset_t *ds; + uint64_t dsobj; + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); + drba->drba_cookie->drc_newfs = B_TRUE; + } + + /* clear the inconsistent flag so that we can own it */ + ASSERT(DS_IS_INCONSISTENT(ds)); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + drba->drba_cookie->drc_ds = ds; + + spa_history_log_internal_ds(ds, "resume receive", tx, ""); +} + +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. 
+ */ +int +dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, + boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc) +{ + dmu_recv_begin_arg_t drba = { 0 }; + + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drr_begin = drr_begin; + drc->drc_drrb = &drr_begin->drr_u.drr_begin; + drc->drc_tosnap = tosnap; + drc->drc_tofs = tofs; + drc->drc_force = force; + drc->drc_resumable = resumable; + drc->drc_cred = CRED(); + + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + drc->drc_byteswap = B_TRUE; + fletcher_4_incremental_byteswap(drr_begin, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + byteswap_record(drr_begin); + } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { + fletcher_4_incremental_native(drr_begin, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + } else { + return (SET_ERROR(EINVAL)); + } + + drba.drba_origin = origin; + drba.drba_cookie = drc; + drba.drba_cred = CRED(); + + if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING) { + return (dsl_sync_task(tofs, + dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } else { + return (dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL)); + } } -struct restorearg { +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a write, pointer to the arc_buf_t containing the + * payload. + */ + arc_buf_t *write_buf; + int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { + objset_t *os; + boolean_t byteswap; + bqueue_t q; + + /* + * These three args are used to signal to the main thread that we're + * done. + */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + int err; - int byteswap; - vnode_t *vp; - char *buf; - uint64_t voff; - int bufsize; /* amount of memory allocated for buf */ + /* A map from guid to dataset to help handle dedup'd streams. */ + avl_tree_t *guid_to_ds_map; + boolean_t resumable; + uint64_t last_object, last_offset; + uint64_t bytes_read; /* bytes read when current record created */ +}; + +struct objlist { + list_t list; /* List of struct receive_objnode. */ + /* + * Last object looked up. Used to assert that objects are being looked + * up in ascending order. + */ + uint64_t last_lookup; +}; + +struct receive_objnode { + list_node_t node; + uint64_t object; +}; + +struct receive_arg { + objset_t *os; + kthread_t *td; + struct file *fp; + uint64_t voff; /* The current offset in the stream */ + uint64_t bytes_read; + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. + */ + struct receive_record_arg *rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *next_rrd; zio_cksum_t cksum; - avl_tree_t guid_to_ds_map; + zio_cksum_t prev_cksum; + int err; + boolean_t byteswap; + /* Sorted list of objects not to issue prefetches for. */ + struct objlist ignore_objlist; }; typedef struct guid_map_entry { @@ -756,102 +1833,100 @@ guid_compare(const void *arg1, const voi return (0); } -/* - * This function is a callback used by dmu_objset_find() (which - * enumerates the object sets) to build an avl tree that maps guids - * to datasets. 
The resulting table is used when processing DRR_WRITE_BYREF - * send stream records. These records, which are used in dedup'ed - * streams, do not contain data themselves, but refer to a copy - * of the data block that has already been written because it was - * earlier in the stream. That previous copy is identified by the - * guid of the dataset with the referenced data. - */ -int -find_ds_by_guid(const char *name, void *arg) +static void +free_guid_map_onexit(void *arg) { - avl_tree_t *guid_map = arg; - dsl_dataset_t *ds, *snapds; + avl_tree_t *ca = arg; + void *cookie = NULL; guid_map_entry_t *gmep; - dsl_pool_t *dp; - int err; - uint64_t lastobj, firstobj; - - if (dsl_dataset_hold(name, FTAG, &ds) != 0) - return (0); - dp = ds->ds_dir->dd_pool; - rw_enter(&dp->dp_config_rwlock, RW_READER); - firstobj = ds->ds_dir->dd_phys->dd_origin_obj; - lastobj = ds->ds_phys->ds_prev_snap_obj; - - while (lastobj != firstobj) { - err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds); - if (err) { - /* - * Skip this snapshot and move on. It's not - * clear why this would ever happen, but the - * remainder of the snapshot streadm can be - * processed. - */ - rw_exit(&dp->dp_config_rwlock); - dsl_dataset_rele(ds, FTAG); - return (0); - } - - gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); - gmep->guid = snapds->ds_phys->ds_guid; - gmep->gme_ds = snapds; - avl_add(guid_map, gmep); - lastobj = snapds->ds_phys->ds_prev_snap_obj; + while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { + dsl_dataset_long_rele(gmep->gme_ds, gmep); + dsl_dataset_rele(gmep->gme_ds, gmep); + kmem_free(gmep, sizeof (guid_map_entry_t)); } + avl_destroy(ca); + kmem_free(ca, sizeof (avl_tree_t)); +} - rw_exit(&dp->dp_config_rwlock); - dsl_dataset_rele(ds, FTAG); - - return (0); +static int +restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid) +{ + struct uio auio; + struct iovec aiov; + int error; + + aiov.iov_base = buf; + aiov.iov_len = len; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; +#ifdef __NetBSD__ +#ifdef _KERNEL + auio.uio_vmspace = vmspace_kernel(); +#endif +#else + auio.uio_segflg = UIO_SYSSPACE; +#endif + auio.uio_rw = UIO_READ; + auio.uio_offset = off; +#ifdef __FreeBSD__ + auio.uio_td = ra->td; +#endif +#ifdef _KERNEL + error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); +#else + fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); + error = EOPNOTSUPP; +#endif + *resid = auio.uio_resid; + return (error); } -static void * -restore_read(struct restorearg *ra, int len) +static int +receive_read(struct receive_arg *ra, int len, void *buf) { - void *rv; int done = 0; - /* some things will require 8-byte alignment, so everything must */ - ASSERT3U(len % 8, ==, 0); + /* + * The code doesn't rely on this (lengths being multiples of 8). See + * comment in dump_bytes. + */ + ASSERT0(len % 8); while (done < len) { ssize_t resid; - ra->err = vn_rdwr(UIO_READ, ra->vp, - (caddr_t)ra->buf + done, len - done, - ra->voff, UIO_SYSSPACE, FAPPEND, - RLIM64_INFINITY, CRED(), &resid); + ra->err = restore_bytes(ra, buf + done, + len - done, ra->voff, &resid); - if (resid == len - done) - ra->err = EINVAL; + if (resid == len - done) { + /* + * Note: ECKSUM indicates that the receive + * was interrupted and can potentially be resumed. 
+ */ + ra->err = SET_ERROR(ECKSUM); + } ra->voff += len - done - resid; done = len - resid; - if (ra->err) - return (NULL); + if (ra->err != 0) + return (ra->err); } + ra->bytes_read += len; + ASSERT3U(done, ==, len); - rv = ra->buf; - if (ra->byteswap) - fletcher_4_incremental_byteswap(rv, len, &ra->cksum); - else - fletcher_4_incremental_native(rv, len, &ra->cksum); - return (rv); + return (0); } static void -backup_byteswap(dmu_replay_record_t *drr) +byteswap_record(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); @@ -864,7 +1939,6 @@ backup_byteswap(dmu_replay_record_t *drr break; case DRR_OBJECT: DO64(drr_object.drr_object); - /* DO64(drr_object.drr_allocation_txg); */ DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); @@ -882,10 +1956,7 @@ backup_byteswap(dmu_replay_record_t *drr DO64(drr_write.drr_offset); DO64(drr_write.drr_length); DO64(drr_write.drr_toguid); - DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); - DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); - DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); - DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); DO64(drr_write.drr_key.ddk_prop); break; case DRR_WRITE_BYREF: @@ -896,538 +1967,1327 @@ backup_byteswap(dmu_replay_record_t *drr DO64(drr_write_byref.drr_refguid); DO64(drr_write_byref.drr_refobject); DO64(drr_write_byref.drr_refoffset); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); - DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. + drr_key.ddk_cksum); DO64(drr_write_byref.drr_key.ddk_prop); break; + case DRR_WRITE_EMBEDDED: + DO64(drr_write_embedded.drr_object); + DO64(drr_write_embedded.drr_offset); + DO64(drr_write_embedded.drr_length); + DO64(drr_write_embedded.drr_toguid); + DO32(drr_write_embedded.drr_lsize); + DO32(drr_write_embedded.drr_psize); + break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); + break; case DRR_END: - DO64(drr_end.drr_checksum.zc_word[0]); - DO64(drr_end.drr_checksum.zc_word[1]); - DO64(drr_end.drr_checksum.zc_word[2]); - DO64(drr_end.drr_checksum.zc_word[3]); DO64(drr_end.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); break; } + + if (drr->drr_type != DRR_BEGIN) { + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); + } + #undef DO64 #undef DO32 } +static inline uint8_t +deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) +{ + if (bonus_type == DMU_OT_SA) { + return (1); + } else { + return (1 + + ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); + } +} + +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. 
+ */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. + */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + static int -restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, + void *data) { - int err; + dmu_object_info_t doi; dmu_tx_t *tx; - void *data = NULL; + uint64_t object; + int err; if (drro->drr_type == DMU_OT_NONE || - drro->drr_type >= DMU_OT_NUMTYPES || - drro->drr_bonustype >= DMU_OT_NUMTYPES || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || - drro->drr_blksz > SPA_MAXBLOCKSIZE || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { - return (EINVAL); + return (SET_ERROR(EINVAL)); } - err = dmu_object_info(os, drro->drr_object, NULL); + err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) - return (EINVAL); + return (SET_ERROR(EINVAL)); + object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; - if (drro->drr_bonuslen) { - data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); - if (ra->err) - return (ra->err); - } + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. 
+ */ + if (err == 0) { + int nblkptr; - if (err == ENOENT) { - /* currently free, want to be allocated */ - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); + nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + + if (drro->drr_blksz != doi.doi_data_block_size || + nblkptr < doi.doi_nblkptr) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); } - err = dmu_object_claim(os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); - dmu_tx_commit(tx); - } else { - /* currently allocated, want to be allocated */ - err = dmu_object_reclaim(os, drro->drr_object, - drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen); } - if (err) - return (EINVAL); - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, drro->drr_object); + tx = dmu_tx_create(rwa->os); + dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { + if (err != 0) { dmu_tx_abort(tx); return (err); } - dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, - tx); - dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); + if (object == DMU_NEW_OBJECT) { + /* currently free, want to be allocated */ + err = dmu_object_claim(rwa->os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } else if (drro->drr_type != doi.doi_type || + drro->drr_blksz != doi.doi_data_block_size || + drro->drr_bonustype != doi.doi_bonus_type || + drro->drr_bonuslen != doi.doi_bonus_size) { + /* currently allocated, but with different properties */ + err = dmu_object_reclaim(rwa->os, drro->drr_object, + drro->drr_type, drro->drr_blksz, + drro->drr_bonustype, drro->drr_bonuslen, tx); + } + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + + dmu_object_set_checksum(rwa->os, drro->drr_object, + drro->drr_checksumtype, tx); + dmu_object_set_compress(rwa->os, drro->drr_object, + drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; - VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); + VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); - if (ra->byteswap) { - dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); + return (0); } /* ARGSUSED */ static int -restore_freeobjects(struct restorearg *ra, objset_t *os, +receive_freeobjects(struct receive_writer_arg *rwa, struct drr_freeobjects *drrfo) { uint64_t obj; + int next_err = 0; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) - return (EINVAL); + return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj; - obj < drrfo->drr_firstobj + drrfo->drr_numobjs; - (void) dmu_object_next(os, &obj, FALSE, 0)) { + obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; + next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { int err; - if (dmu_object_info(os, obj, NULL) != 0) + if (dmu_object_info(rwa->os, obj, NULL) != 0) continue; - err = dmu_free_object(os, obj); - if (err) + err = dmu_free_long_object(rwa->os, obj); + if (err != 0) return (err); } + if (next_err != 
ESRCH) + return (next_err); + return (0); +} + +static int +receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, + arc_buf_t *abuf) +{ + dmu_tx_t *tx; + int err; + + if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || + !DMU_OT_IS_VALID(drrw->drr_type)) + return (SET_ERROR(EINVAL)); + + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + + if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_write(tx, drrw->drr_object, + drrw->drr_offset, drrw->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + drrw->drr_length); + } + + dmu_buf_t *bonus; + if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) + return (SET_ERROR(EINVAL)); + dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); + + /* + * Note: If the receive fails, we want the resume stream to start + * with the same record that we last successfully received (as opposed + * to the next record), so that we can verify that we are + * resuming from the correct location. + */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); + dmu_tx_commit(tx); + dmu_buf_rele(bonus, FTAG); + return (0); } +/* + * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed + * streams to refer to a copy of the data that is already on the + * system because it came in earlier in the stream. This function + * finds the earlier copy of the data, and uses that copy instead of + * data from the stream to fulfill this write. + */ +static int +receive_write_byref(struct receive_writer_arg *rwa, + struct drr_write_byref *drrwbr) +{ + dmu_tx_t *tx; + int err; + guid_map_entry_t gmesrch; + guid_map_entry_t *gmep; + avl_index_t where; + objset_t *ref_os = NULL; + dmu_buf_t *dbp; + + if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) + return (SET_ERROR(EINVAL)); + + /* + * If the GUID of the referenced dataset is different from the + * GUID of the target dataset, find the referenced dataset. + */ + if (drrwbr->drr_toguid != drrwbr->drr_refguid) { + gmesrch.guid = drrwbr->drr_refguid; + if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch, + &where)) == NULL) { + return (SET_ERROR(EINVAL)); + } + if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) + return (SET_ERROR(EINVAL)); + } else { + ref_os = rwa->os; + } + + err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, + drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); + if (err != 0) + return (err); + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_write(tx, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + dmu_write(rwa->os, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + dmu_buf_rele(dbp, FTAG); + + /* See comment in restore_write. 
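For DRR_WRITE_BYREF records the code above resolves drr_refguid through the per-receive guid-to-dataset map (rwa->guid_to_ds_map) before copying the block that already arrived earlier in the stream. The lookup itself is just a keyed search; a minimal userland sketch of the idea, using a sorted array and bsearch(3) in place of the kernel AVL tree (the types and names here are illustrative, not the ZFS API):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative stand-in for guid_map_entry_t: maps a stream GUID to a dataset. */
    struct guid_map_entry {
            uint64_t guid;
            const char *dataset;            /* placeholder for the dsl_dataset_t hold */
    };

    static int
    guid_cmp(const void *a, const void *b)
    {
            const struct guid_map_entry *ga = a, *gb = b;

            if (ga->guid < gb->guid)
                    return (-1);
            return (ga->guid > gb->guid);
    }

    /* Resolve refguid to a dataset, as receive_write_byref() does with avl_find(). */
    static const char *
    lookup_ref(struct guid_map_entry *map, size_t n, uint64_t refguid)
    {
            struct guid_map_entry key = { .guid = refguid };
            struct guid_map_entry *gmep;

            gmep = bsearch(&key, map, n, sizeof (*map), guid_cmp);
            return (gmep != NULL ? gmep->dataset : NULL);
    }

    int
    main(void)
    {
            struct guid_map_entry map[] = {         /* must stay sorted by guid */
                    { 0x1111, "pool/recv@snap1" },
                    { 0x2222, "pool/recv@snap2" },
            };
            const char *ds = lookup_ref(map, 2, 0x2222);

            printf("refguid 0x2222 -> %s\n", ds != NULL ? ds : "not found (EINVAL)");
            return (0);
    }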
*/ + save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx); + dmu_tx_commit(tx); + return (0); +} + +static int +receive_write_embedded(struct receive_writer_arg *rwa, + struct drr_write_embedded *drrwe, void *data) +{ + dmu_tx_t *tx; + int err; + + if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) + return (EINVAL); + + if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) + return (EINVAL); + + if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) + return (EINVAL); + if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + return (EINVAL); + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_write(tx, drrwe->drr_object, + drrwe->drr_offset, drrwe->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + dmu_write_embedded(rwa->os, drrwe->drr_object, + drrwe->drr_offset, data, drrwe->drr_etype, + drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, + rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); + + /* See comment in restore_write. */ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); + dmu_tx_commit(tx); + return (0); +} + +static int +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, + void *data) +{ + dmu_tx_t *tx; + dmu_buf_t *db, *db_spill; + int err; + + if (drrs->drr_length < SPA_MINBLOCKSIZE || + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) + return (SET_ERROR(EINVAL)); + + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); + if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_spill(tx, db->db_object); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_tx_abort(tx); + return (err); + } + dmu_buf_will_dirty(db_spill, tx); + + if (db_spill->db_size < drrs->drr_length) + VERIFY(0 == dbuf_spill_set_blksz(db_spill, + drrs->drr_length, tx)); + bcopy(data, db_spill->db_data, drrs->drr_length); + + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +static int +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) +{ + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (SET_ERROR(EINVAL)); + + if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + err = dmu_free_long_range(rwa->os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + + return (err); +} + +/* used to destroy the drc_ds on error */ +static void +dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) +{ + if (drc->drc_resumable) { + /* wait for our resume state to be written to disk */ + txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + } else { + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(drc->drc_ds, name); + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + (void) dsl_destroy_head(name); + } +} + +static void +receive_cksum(struct receive_arg *ra, int len, void *buf) +{ + if (ra->byteswap) { + fletcher_4_incremental_byteswap(buf, len, &ra->cksum); + } else { + fletcher_4_incremental_native(buf, len, &ra->cksum); + } +} + +/* + * Read the payload into a buffer of size len, and update the current record's + * payload field. 
+ * Allocate ra->next_rrd and read the next record's header into + * ra->next_rrd->header. + * Verify checksum of payload and next record. + */ +static int +receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) +{ + int err; + + if (len != 0) { + ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); + err = receive_read(ra, len, buf); + if (err != 0) + return (err); + receive_cksum(ra, len, buf); + + /* note: rrd is NULL when reading the begin record's payload */ + if (ra->rrd != NULL) { + ra->rrd->payload = buf; + ra->rrd->payload_size = len; + ra->rrd->bytes_read = ra->bytes_read; + } + } + + ra->prev_cksum = ra->cksum; + + ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP); + err = receive_read(ra, sizeof (ra->next_rrd->header), + &ra->next_rrd->header); + ra->next_rrd->bytes_read = ra->bytes_read; + if (err != 0) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; + return (err); + } + if (ra->next_rrd->header.drr_type == DRR_BEGIN) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; + return (SET_ERROR(EINVAL)); + } + + /* + * Note: checksum is of everything up to but not including the + * checksum itself. + */ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + receive_cksum(ra, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + &ra->next_rrd->header); + + zio_cksum_t cksum_orig = + ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + zio_cksum_t *cksump = + &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum; + + if (ra->byteswap) + byteswap_record(&ra->next_rrd->header); + + if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && + !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) { + kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); + ra->next_rrd = NULL; + return (SET_ERROR(ECKSUM)); + } + + receive_cksum(ra, sizeof (cksum_orig), &cksum_orig); + + return (0); +} + +static void +objlist_create(struct objlist *list) +{ + list_create(&list->list, sizeof (struct receive_objnode), + offsetof(struct receive_objnode, node)); + list->last_lookup = 0; +} + +static void +objlist_destroy(struct objlist *list) +{ + for (struct receive_objnode *n = list_remove_head(&list->list); + n != NULL; n = list_remove_head(&list->list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&list->list); +} + +/* + * This function looks through the objlist to see if the specified object number + * is contained in the objlist. In the process, it will remove all object + * numbers in the list that are smaller than the specified object number. Thus, + * any lookup of an object number smaller than a previously looked up object + * number will always return false; therefore, all lookups should be done in + * ascending order. + */ +static boolean_t +objlist_exists(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = list_head(&list->list); + ASSERT3U(object, >=, list->last_lookup); + list->last_lookup = object; + while (node != NULL && node->object < object) { + VERIFY3P(node, ==, list_remove_head(&list->list)); + kmem_free(node, sizeof (*node)); + node = list_head(&list->list); + } + return (node != NULL && node->object == object); +} + +/* + * The objlist is a list of object numbers stored in ascending order. However, + * the insertion of new object numbers does not seek out the correct location to + * store a new object number; instead, it appends it to the list for simplicity. 
+ * Thus, any users must take care to only insert new object numbers in ascending + * order. + */ +static void +objlist_insert(struct objlist *list, uint64_t object) +{ + struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP); + node->object = object; +#ifdef ZFS_DEBUG + struct receive_objnode *last_object = list_tail(&list->list); + uint64_t last_objnum = (last_object != NULL ? last_object->object : 0); + ASSERT3U(node->object, >, last_objnum); +#endif + list_insert_tail(&list->list, node); +} + +/* + * Issue the prefetch reads for any necessary indirect blocks. + * + * We use the object ignore list to tell us whether or not to issue prefetches + * for a given object. We do this for both correctness (in case the blocksize + * of an object has changed) and performance (if the object doesn't exist, don't + * needlessly try to issue prefetches). We also trim the list as we go through + * the stream to prevent it from growing to an unbounded size. + * + * The object numbers within will always be in sorted order, and any write + * records we see will also be in sorted order, but they're not sorted with + * respect to each other (i.e. we can get several object records before + * receiving each object's write records). As a result, once we've reached a + * given object number, we can safely remove any reference to lower object + * numbers in the ignore list. In practice, we receive up to 32 object records + * before receiving write records, so the list can have up to 32 nodes in it. + */ +/* ARGSUSED */ +static void +receive_read_prefetch(struct receive_arg *ra, + uint64_t object, uint64_t offset, uint64_t length) +{ + if (!objlist_exists(&ra->ignore_objlist, object)) { + dmu_prefetch(ra->os, object, 1, offset, length, + ZIO_PRIORITY_SYNC_READ); + } +} + +/* + * Read records off the stream, issuing any necessary prefetches. + */ static int -restore_write(struct restorearg *ra, objset_t *os, - struct drr_write *drrw) +receive_read_record(struct receive_arg *ra) { - dmu_tx_t *tx; - void *data; int err; - if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || - drrw->drr_type >= DMU_OT_NUMTYPES) - return (EINVAL); - - data = restore_read(ra, drrw->drr_length); - if (data == NULL) - return (ra->err); - - if (dmu_object_info(os, drrw->drr_object, NULL) != 0) - return (EINVAL); - - tx = dmu_tx_create(os); + switch (ra->rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; + uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + dmu_object_info_t doi; + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); + return (err); + } + err = dmu_object_info(ra->os, drro->drr_object, &doi); + /* + * See receive_read_prefetch for an explanation why we're + * storing this object in the ignore_obj_list. 
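The ignore list consulted by the prefetch path is the append-only, ascending-order objlist described above. A small userland sketch of that invariant, simplified to a fixed array instead of the kernel list_t (names and sizes here are illustrative):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define IGNORE_MAX      32      /* the comment above notes ~32 pending object records */

    struct objlist {
            uint64_t objs[IGNORE_MAX];
            int head, tail;         /* head: next entry to examine, tail: next free slot */
            uint64_t last_lookup;
    };

    /* Append-only insert; callers must insert object numbers in ascending order. */
    static void
    objlist_insert(struct objlist *l, uint64_t obj)
    {
            assert(l->tail < IGNORE_MAX);
            assert(l->tail == l->head || l->objs[l->tail - 1] < obj);
            l->objs[l->tail++] = obj;
    }

    /* Lookups must also be ascending; smaller entries are trimmed as we go. */
    static int
    objlist_exists(struct objlist *l, uint64_t obj)
    {
            assert(obj >= l->last_lookup);
            l->last_lookup = obj;
            while (l->head < l->tail && l->objs[l->head] < obj)
                    l->head++;      /* this entry can never match again, drop it */
            return (l->head < l->tail && l->objs[l->head] == obj);
    }

    int
    main(void)
    {
            struct objlist l = { .head = 0, .tail = 0, .last_lookup = 0 };
            int a, b, c;

            objlist_insert(&l, 5);
            objlist_insert(&l, 9);
            a = objlist_exists(&l, 3);      /* 0: never inserted */
            b = objlist_exists(&l, 5);      /* 1: found */
            c = objlist_exists(&l, 7);      /* 0: not inserted, and 5 is trimmed */
            printf("%d %d %d\n", a, b, c);
            return (0);
    }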
+ */ + if (err == ENOENT || + (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { + objlist_insert(&ra->ignore_objlist, drro->drr_object); + err = 0; + } + return (err); + } + case DRR_FREEOBJECTS: + { + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); + } + case DRR_WRITE: + { + struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; + arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), + drrw->drr_length); + + err = receive_read_payload_and_next_header(ra, + drrw->drr_length, abuf->b_data); + if (err != 0) { + dmu_return_arcbuf(abuf); + return (err); + } + ra->rrd->write_buf = abuf; + receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, + drrw->drr_length); + return (err); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwb = + &ra->rrd->header.drr_u.drr_write_byref; + err = receive_read_payload_and_next_header(ra, 0, NULL); + receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset, + drrwb->drr_length); + return (err); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &ra->rrd->header.drr_u.drr_write_embedded; + uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + + err = receive_read_payload_and_next_header(ra, size, buf); + if (err != 0) { + kmem_free(buf, size); + return (err); + } - dmu_tx_hold_write(tx, drrw->drr_object, - drrw->drr_offset, drrw->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); + receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset, + drrwe->drr_length); return (err); } - if (ra->byteswap) - dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); - dmu_write(os, drrw->drr_object, - drrw->drr_offset, drrw->drr_length, data, tx); - dmu_tx_commit(tx); - return (0); + case DRR_FREE: + { + /* + * It might be beneficial to prefetch indirect blocks here, but + * we don't really have the data to decide for sure. + */ + err = receive_read_payload_and_next_header(ra, 0, NULL); + return (err); + } + case DRR_END: + { + struct drr_end *drre = &ra->rrd->header.drr_u.drr_end; + if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) + return (SET_ERROR(ECKSUM)); + return (0); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; + void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, drrs->drr_length, + buf); + if (err != 0) + kmem_free(buf, drrs->drr_length); + return (err); + } + default: + return (SET_ERROR(EINVAL)); + } } /* - * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed - * streams to refer to a copy of the data that is already on the - * system because it came in earlier in the stream. This function - * finds the earlier copy of the data, and uses that copy instead of - * data from the stream to fulfill this write. + * Commit the records to the pool. */ static int -restore_write_byref(struct restorearg *ra, objset_t *os, - struct drr_write_byref *drrwbr) +receive_process_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) { - dmu_tx_t *tx; int err; - guid_map_entry_t gmesrch; - guid_map_entry_t *gmep; - avl_index_t where; - objset_t *ref_os = NULL; - dmu_buf_t *dbp; - if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) - return (EINVAL); + /* Processing in order, therefore bytes_read should be increasing. 
*/ + ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); + rwa->bytes_read = rrd->bytes_read; - /* - * If the GUID of the referenced dataset is different from the - * GUID of the target dataset, find the referenced dataset. - */ - if (drrwbr->drr_toguid != drrwbr->drr_refguid) { - gmesrch.guid = drrwbr->drr_refguid; - if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch, - &where)) == NULL) { - return (EINVAL); - } - if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) - return (EINVAL); - } else { - ref_os = os; + switch (rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &rrd->header.drr_u.drr_object; + err = receive_object(rwa, drro, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); } - - if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, - drrwbr->drr_refoffset, FTAG, &dbp)) + case DRR_FREEOBJECTS: + { + struct drr_freeobjects *drrfo = + &rrd->header.drr_u.drr_freeobjects; + return (receive_freeobjects(rwa, drrfo)); + } + case DRR_WRITE: + { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + err = receive_write(rwa, drrw, rrd->write_buf); + /* if receive_write() is successful, it consumes the arc_buf */ + if (err != 0) + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; return (err); - - tx = dmu_tx_create(os); - - dmu_tx_hold_write(tx, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwbr = + &rrd->header.drr_u.drr_write_byref; + return (receive_write_byref(rwa, drrwbr)); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &rrd->header.drr_u.drr_write_embedded; + err = receive_write_embedded(rwa, drrwe, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; return (err); } - dmu_write(os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); - dmu_buf_rele(dbp, FTAG); - dmu_tx_commit(tx); - return (0); + case DRR_FREE: + { + struct drr_free *drrf = &rrd->header.drr_u.drr_free; + return (receive_free(rwa, drrf)); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; + err = receive_spill(rwa, drrs, rrd->payload); + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + return (err); + } + default: + return (SET_ERROR(EINVAL)); + } } -/* ARGSUSED */ -static int -restore_free(struct restorearg *ra, objset_t *os, - struct drr_free *drrf) +/* + * dmu_recv_stream's worker thread; pull records off the queue, and then call + * receive_process_record When we're done, signal the main thread and exit. + */ +static void +receive_writer_thread(void *arg) { - int err; - - if (drrf->drr_length != -1ULL && - drrf->drr_offset + drrf->drr_length < drrf->drr_offset) - return (EINVAL); + struct receive_writer_arg *rwa = arg; + struct receive_record_arg *rrd; + for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker; + rrd = bqueue_dequeue(&rwa->q)) { + /* + * If there's an error, the main thread will stop putting things + * on the queue, but we need to clear everything in it before we + * can exit. 
+ */ + if (rwa->err == 0) { + rwa->err = receive_process_record(rwa, rrd); + } else if (rrd->write_buf != NULL) { + dmu_return_arcbuf(rrd->write_buf); + rrd->write_buf = NULL; + rrd->payload = NULL; + } else if (rrd->payload != NULL) { + kmem_free(rrd->payload, rrd->payload_size); + rrd->payload = NULL; + } + kmem_free(rrd, sizeof (*rrd)); + } + kmem_free(rrd, sizeof (*rrd)); + mutex_enter(&rwa->mutex); + rwa->done = B_TRUE; + cv_signal(&rwa->cv); + mutex_exit(&rwa->mutex); + thread_exit(); +} - if (dmu_object_info(os, drrf->drr_object, NULL) != 0) - return (EINVAL); +static int +resume_check(struct receive_arg *ra, nvlist_t *begin_nvl) +{ + uint64_t val; + objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset; + uint64_t dsobj = dmu_objset_id(ra->os); + uint64_t resume_obj, resume_off; + + if (nvlist_lookup_uint64(begin_nvl, + "resume_object", &resume_obj) != 0 || + nvlist_lookup_uint64(begin_nvl, + "resume_offset", &resume_off) != 0) { + return (SET_ERROR(EINVAL)); + } + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val)); + if (resume_obj != val) + return (SET_ERROR(EINVAL)); + VERIFY0(zap_lookup(mos, dsobj, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val)); + if (resume_off != val) + return (SET_ERROR(EINVAL)); - err = dmu_free_long_range(os, drrf->drr_object, - drrf->drr_offset, drrf->drr_length); - return (err); + return (0); } /* + * Read in the stream's records, one by one, and apply them to the pool. There + * are two threads involved; the thread that calls this function will spin up a + * worker thread, read the records off the stream one by one, and issue + * prefetches for any necessary indirect blocks. It will then push the records + * onto an internal blocking queue. The worker thread will pull the records off + * the queue, and actually write the data into the DMU. This way, the worker + * thread doesn't have to wait for reads to complete, since everything it needs + * (the indirect blocks) will be prefetched. + * * NB: callers *must* call dmu_recv_end() if this succeeds. 
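dmu_recv_stream() now splits the work between the reading thread and receive_writer_thread() above: records travel over a blocking queue, and after an error the writer must keep draining and freeing records until it dequeues the end-of-stream marker. A small pthreads sketch of that pattern; the queue and record types below are simplified stand-ins for bqueue_t and struct receive_record_arg, not the kernel interfaces:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct record {
            struct record *next;
            int eos_marker;         /* last record: tells the writer to exit */
            int payload;            /* stand-in for the real record body */
    };

    static pthread_mutex_t q_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t q_cv = PTHREAD_COND_INITIALIZER;
    static struct record *q_head, *q_tail;
    static int writer_err;          /* written only by the writer thread */

    static void
    enqueue(struct record *r)
    {
            pthread_mutex_lock(&q_mtx);
            if (q_tail != NULL)
                    q_tail->next = r;
            else
                    q_head = r;
            q_tail = r;
            pthread_cond_signal(&q_cv);
            pthread_mutex_unlock(&q_mtx);
    }

    static struct record *
    dequeue(void)
    {
            struct record *r;

            pthread_mutex_lock(&q_mtx);
            while (q_head == NULL)
                    pthread_cond_wait(&q_cv, &q_mtx);
            r = q_head;
            q_head = r->next;
            if (q_head == NULL)
                    q_tail = NULL;
            pthread_mutex_unlock(&q_mtx);
            return (r);
    }

    /* Like receive_writer_thread(): apply records, but always drain to the marker. */
    static void *
    writer(void *arg)
    {
            struct record *r;

            for (r = dequeue(); !r->eos_marker; r = dequeue()) {
                    if (writer_err == 0 && r->payload < 0)
                            writer_err = -1;        /* first failure; later records are only freed */
                    free(r);
            }
            free(r);                /* the eos marker itself */
            return (NULL);
    }

    int
    main(void)
    {
            int payloads[] = { 1, 2, -1, 3 };       /* -1 simulates a record that fails */
            pthread_t tid;
            struct record *eos;

            pthread_create(&tid, NULL, writer, NULL);
            for (int i = 0; i < 4; i++) {
                    struct record *r = calloc(1, sizeof (*r));
                    r->payload = payloads[i];
                    enqueue(r);
            }
            eos = calloc(1, sizeof (*eos));
            eos->eos_marker = 1;
            enqueue(eos);
            pthread_join(tid, NULL);
            printf("writer_err = %d\n", writer_err);
            return (0);
    }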
*/ int -dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) +dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, + int cleanup_fd, uint64_t *action_handlep) { - struct restorearg ra = { 0 }; - dmu_replay_record_t *drr; - objset_t *os; - zio_cksum_t pcksum; - guid_map_entry_t *gmep; + int err = 0; + struct receive_arg ra = { 0 }; + struct receive_writer_arg rwa = { 0 }; int featureflags; + nvlist_t *begin_nvl = NULL; - if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) - ra.byteswap = TRUE; - - { - /* compute checksum of drr_begin record */ - dmu_replay_record_t *drr; - drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); - - drr->drr_type = DRR_BEGIN; - drr->drr_u.drr_begin = *drc->drc_drrb; - if (ra.byteswap) { - fletcher_4_incremental_byteswap(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } else { - fletcher_4_incremental_native(drr, - sizeof (dmu_replay_record_t), &ra.cksum); - } - kmem_free(drr, sizeof (dmu_replay_record_t)); - } + ra.byteswap = drc->drc_byteswap; + ra.cksum = drc->drc_cksum; + ra.td = curthread; + ra.fp = fp; + ra.voff = *voffp; - if (ra.byteswap) { - struct drr_begin *drrb = drc->drc_drrb; - drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); - drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); - drrb->drr_type = BSWAP_32(drrb->drr_type); - drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); - drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + if (dsl_dataset_is_zapified(drc->drc_ds)) { + (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, + drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, + sizeof (ra.bytes_read), 1, &ra.bytes_read); } - ra.vp = vp; - ra.voff = *voffp; - ra.bufsize = 1<<20; - ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); + objlist_create(&ra.ignore_objlist); /* these were verified in dmu_recv_begin */ - ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == + ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); - ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); + ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ - VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); + VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os)); - ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - avl_create(&ra.guid_to_ds_map, guid_compare, - sizeof (guid_map_entry_t), - offsetof(guid_map_entry_t, avlnode)); - (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, - (void *)&ra.guid_to_ds_map, - DS_FIND_CHILDREN); - } + minor_t minor; - /* - * Read records and process them. 
- */ - pcksum = ra.cksum; - while (ra.err == 0 && - NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (issig(JUSTLOOKING) && issig(FORREAL)) { - ra.err = EINTR; + if (cleanup_fd == -1) { + ra.err = SET_ERROR(EBADF); + goto out; + } + ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (ra.err != 0) { + cleanup_fd = -1; goto out; } - if (ra.byteswap) - backup_byteswap(drr); + if (*action_handlep == 0) { + rwa.guid_to_ds_map = + kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(rwa.guid_to_ds_map, guid_compare, + sizeof (guid_map_entry_t), + offsetof(guid_map_entry_t, avlnode)); + err = zfs_onexit_add_cb(minor, + free_guid_map_onexit, rwa.guid_to_ds_map, + action_handlep); + if (ra.err != 0) + goto out; + } else { + err = zfs_onexit_cb_data(minor, *action_handlep, + (void **)&rwa.guid_to_ds_map); + if (ra.err != 0) + goto out; + } + + drc->drc_guid_to_ds_map = rwa.guid_to_ds_map; + } + + uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; + void *payload = NULL; + if (payloadlen != 0) + payload = kmem_alloc(payloadlen, KM_SLEEP); + + err = receive_read_payload_and_next_header(&ra, payloadlen, payload); + if (err != 0) { + if (payloadlen != 0) + kmem_free(payload, payloadlen); + goto out; + } + if (payloadlen != 0) { + err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP); + kmem_free(payload, payloadlen); + if (err != 0) + goto out; + } - switch (drr->drr_type) { - case DRR_OBJECT: - { - /* - * We need to make a copy of the record header, - * because restore_{object,write} may need to - * restore_read(), which will invalidate drr. - */ - struct drr_object drro = drr->drr_u.drr_object; - ra.err = restore_object(&ra, os, &drro); - break; - } - case DRR_FREEOBJECTS: - { - struct drr_freeobjects drrfo = - drr->drr_u.drr_freeobjects; - ra.err = restore_freeobjects(&ra, os, &drrfo); - break; - } - case DRR_WRITE: - { - struct drr_write drrw = drr->drr_u.drr_write; - ra.err = restore_write(&ra, os, &drrw); - break; - } - case DRR_WRITE_BYREF: - { - struct drr_write_byref drrwbr = - drr->drr_u.drr_write_byref; - ra.err = restore_write_byref(&ra, os, &drrwbr); + if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = resume_check(&ra, begin_nvl); + if (err != 0) + goto out; + } + + (void) bqueue_init(&rwa.q, zfs_recv_queue_length, + offsetof(struct receive_record_arg, node)); + cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL); + mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL); + rwa.os = ra.os; + rwa.byteswap = drc->drc_byteswap; + rwa.resumable = drc->drc_resumable; + + (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0, + TS_RUN, minclsyspri); + /* + * We're reading rwa.err without locks, which is safe since we are the + * only reader, and the worker thread is the only writer. It's ok if we + * miss a write for an iteration or two of the loop, since the writer + * thread will keep freeing records we send it until we send it an eos + * marker. + * + * We can leave this loop in 3 ways: First, if rwa.err is + * non-zero. In that case, the writer thread will free the rrd we just + * pushed. Second, if we're interrupted; in that case, either it's the + * first loop and ra.rrd was never allocated, or it's later, and ra.rrd + * has been handed off to the writer thread who will free it. Finally, + * if receive_read_record fails or we're at the end of the stream, then + * we free ra.rrd and exit. 
+ */ + while (rwa.err == 0) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); break; } - case DRR_FREE: - { - struct drr_free drrf = drr->drr_u.drr_free; - ra.err = restore_free(&ra, os, &drrf); + + ASSERT3P(ra.rrd, ==, NULL); + ra.rrd = ra.next_rrd; + ra.next_rrd = NULL; + /* Allocates and loads header into ra.next_rrd */ + err = receive_read_record(&ra); + + if (ra.rrd->header.drr_type == DRR_END || err != 0) { + kmem_free(ra.rrd, sizeof (*ra.rrd)); + ra.rrd = NULL; break; } - case DRR_END: - { - struct drr_end drre = drr->drr_u.drr_end; - /* - * We compare against the *previous* checksum - * value, because the stored checksum is of - * everything before the DRR_END record. - */ - if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) - ra.err = ECKSUM; - goto out; - } - default: - ra.err = EINVAL; - goto out; - } - pcksum = ra.cksum; - } - ASSERT(ra.err != 0); + + bqueue_enqueue(&rwa.q, ra.rrd, + sizeof (struct receive_record_arg) + ra.rrd->payload_size); + ra.rrd = NULL; + } + if (ra.next_rrd == NULL) + ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); + ra.next_rrd->eos_marker = B_TRUE; + bqueue_enqueue(&rwa.q, ra.next_rrd, 1); + + mutex_enter(&rwa.mutex); + while (!rwa.done) { + cv_wait(&rwa.cv, &rwa.mutex); + } + mutex_exit(&rwa.mutex); + + cv_destroy(&rwa.cv); + mutex_destroy(&rwa.mutex); + bqueue_destroy(&rwa.q); + if (err == 0) + err = rwa.err; out: - if (ra.err != 0) { + nvlist_free(begin_nvl); + if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) + zfs_onexit_fd_rele(cleanup_fd); + + if (err != 0) { /* - * destroy what we created, so we don't leave it in the - * inconsistent restoring state. + * Clean up references. If receive is not resumable, + * destroy what we created, so we don't leave it in + * the inconsistent state. */ - txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, - B_FALSE); - if (drc->drc_real_ds != drc->drc_logical_ds) { - mutex_exit(&drc->drc_logical_ds->ds_recvlock); - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } - } - - if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { - void *cookie = NULL; - - while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) { - dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map); - kmem_free(gmep, sizeof (guid_map_entry_t)); - } - avl_destroy(&ra.guid_to_ds_map); + dmu_recv_cleanup_ds(drc); } - kmem_free(ra.buf, ra.bufsize); *voffp = ra.voff; - return (ra.err); + objlist_destroy(&ra.ignore_objlist); + return (err); } -struct recvendsyncarg { - char *tosnap; - uint64_t creation_time; - uint64_t toguid; -}; - static int -recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) +dmu_recv_end_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int error; + + ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); + if (error != 0) + return (error); + if (drc->drc_force) { + /* + * We will destroy any snapshots in tofs (i.e. before + * origin_head) that are after the origin (which is + * the snap before drc_ds, because drc_ds can not + * have any snaps of its own). 
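With force receives, the end-check walks the snapshot chain backwards from origin_head via ds_prev_snap_obj until it reaches the snapshot that drc_ds was cloned from, verifying that every snapshot in between may be destroyed. Stripped of the DSL machinery, that is a plain chain walk with a stop object; a minimal sketch with hypothetical types (not the DSL API):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative snapshot record; each one points at the previous snapshot. */
    struct snap {
            uint64_t obj;
            uint64_t prev_snap_obj;         /* 0 terminates the chain */
            const char *name;
    };

    static struct snap *
    lookup(struct snap *snaps, size_t n, uint64_t obj)
    {
            for (size_t i = 0; i < n; i++)
                    if (snaps[i].obj == obj)
                            return (&snaps[i]);
            return (NULL);
    }

    /*
     * Visit every snapshot strictly after 'stop_obj' (the clone origin),
     * newest first, the way the force path decides what it may destroy.
     */
    static void
    visit_snaps_after_origin(struct snap *snaps, size_t n, uint64_t head_prev,
        uint64_t stop_obj)
    {
            for (uint64_t obj = head_prev; obj != stop_obj && obj != 0; ) {
                    struct snap *s = lookup(snaps, n, obj);

                    if (s == NULL)
                            break;
                    printf("would destroy %s\n", s->name);
                    obj = s->prev_snap_obj;
            }
    }

    int
    main(void)
    {
            struct snap snaps[] = {
                    { 10, 0, "tofs@origin" },
                    { 20, 10, "tofs@a" },
                    { 30, 20, "tofs@b" },
            };

            /* origin_head's newest snapshot is 30; the origin (stop) is 10. */
            visit_snaps_after_origin(snaps, 3, 30, 10);
            return (0);
    }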
+ */ + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { + dsl_dataset_t *snap; + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) + break; + if (snap->ds_dir != origin_head->ds_dir) + error = SET_ERROR(EINVAL); + if (error == 0) { + error = dsl_destroy_snapshot_check_impl( + snap, B_FALSE); + } + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + if (error != 0) + break; + } + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + } + error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, + origin_head, drc->drc_force, drc->drc_owner, tx); + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + error = dsl_dataset_snapshot_check_impl(origin_head, + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); + dsl_dataset_rele(origin_head, FTAG); + if (error != 0) + return (error); - return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); + error = dsl_destroy_head_check_impl(drc->drc_ds, 1); + } else { + error = dsl_dataset_snapshot_check_impl(drc->drc_ds, + drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); + } + return (error); } static void -recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - struct recvendsyncarg *resa = arg2; + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx); + spa_history_log_internal_ds(drc->drc_ds, "finish receiving", + tx, "snap=%s", drc->drc_tosnap); - /* set snapshot's creation time and guid */ - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; - ds->ds_prev->ds_phys->ds_guid = resa->toguid; - ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; + VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, + &origin_head)); + + if (drc->drc_force) { + /* + * Destroy any snapshots of drc_tofs (origin_head) + * after the origin (the snap before drc_ds). 
+ */ + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { + dsl_dataset_t *snap; + VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, + &snap)); + ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_destroy_snapshot_sync_impl(snap, + B_FALSE, tx); + dsl_dataset_rele(snap, FTAG); + } + } + VERIFY3P(drc->drc_ds->ds_prev, ==, + origin_head->ds_prev); + + dsl_dataset_clone_swap_sync_impl(drc->drc_ds, + origin_head, tx); + dsl_dataset_snapshot_sync_impl(origin_head, + drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); + dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = + drc->drc_drrb->drr_creation_time; + dsl_dataset_phys(origin_head->ds_prev)->ds_guid = + drc->drc_drrb->drr_toguid; + dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); + dsl_dataset_phys(origin_head)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + drc->drc_newsnapobj = + dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + + dsl_dataset_rele(origin_head, FTAG); + dsl_destroy_head_sync_impl(drc->drc_ds, tx); + + if (drc->drc_owner != NULL) + VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); + } else { + dsl_dataset_t *ds = drc->drc_ds; + + dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + dsl_dataset_phys(ds->ds_prev)->ds_creation_time = + drc->drc_drrb->drr_creation_time; + dsl_dataset_phys(ds->ds_prev)->ds_guid = + drc->drc_drrb->drr_toguid; + dsl_dataset_phys(ds->ds_prev)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (dsl_dataset_has_resume_receive_state(ds)) { + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, tx); + } + drc->drc_newsnapobj = + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; + } + /* + * Release the hold from dmu_recv_begin. This must be done before + * we return to open context, so that when we free the dataset's dnode, + * we can evict its bonus buffer. + */ + dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + drc->drc_ds = NULL; } static int -dmu_recv_existing_end(dmu_recv_cookie_t *drc) +add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) { - struct recvendsyncarg resa; - dsl_dataset_t *ds = drc->drc_logical_ds; + dsl_pool_t *dp; + dsl_dataset_t *snapds; + guid_map_entry_t *gmep; int err; - /* - * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() - * expects it to have a ds_user_ptr (and zil), but clone_swap() - * can close it. 
- */ - txg_wait_synced(ds->ds_dir->dd_pool, 0); - - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { - err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, - drc->drc_force); - if (err) - goto out; - } else { - mutex_exit(&ds->ds_recvlock); - dsl_dataset_rele(ds, dmu_recv_tag); - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, - B_FALSE); - return (EBUSY); - } - - resa.creation_time = drc->drc_drrb->drr_creation_time; - resa.toguid = drc->drc_drrb->drr_toguid; - resa.tosnap = drc->drc_tosnap; - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_end_check, recv_end_sync, ds, &resa, 3); - if (err) { - /* swap back */ - (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); - } + ASSERT(guid_map != NULL); -out: - mutex_exit(&ds->ds_recvlock); - dsl_dataset_disown(ds, dmu_recv_tag); - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); + err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); + if (err == 0) { + gmep->guid = dsl_dataset_phys(snapds)->ds_guid; + gmep->gme_ds = snapds; + avl_add(guid_map, gmep); + dsl_dataset_long_hold(snapds, gmep); + } else + kmem_free(gmep, sizeof (*gmep)); + + dsl_pool_rele(dp, FTAG); return (err); } +static int dmu_recv_end_modified_blocks = 3; + static int -dmu_recv_new_end(dmu_recv_cookie_t *drc) +dmu_recv_existing_end(dmu_recv_cookie_t *drc) { - struct recvendsyncarg resa; - dsl_dataset_t *ds = drc->drc_logical_ds; - int err; - +#ifdef _KERNEL /* - * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() - * expects it to have a ds_user_ptr (and zil), but clone_swap() - * can close it. - */ - txg_wait_synced(ds->ds_dir->dd_pool, 0); - - resa.creation_time = drc->drc_drrb->drr_creation_time; - resa.toguid = drc->drc_drrb->drr_toguid; - resa.tosnap = drc->drc_tosnap; - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_end_check, recv_end_sync, ds, &resa, 3); - if (err) { - /* clean up the fs we just recv'd into */ - (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); - } else { - /* release the hold from dmu_recv_begin */ - dsl_dataset_disown(ds, dmu_recv_tag); - } - return (err); + * We will be destroying the ds; make sure its origin is unmounted if + * necessary. + */ + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(drc->drc_ds, name); + zfs_destroy_unmount_origin(name); +#endif + + return (dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); +} + +static int +dmu_recv_new_end(dmu_recv_cookie_t *drc) +{ + return (dsl_sync_task(drc->drc_tofs, + dmu_recv_end_check, dmu_recv_end_sync, drc, + dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL)); } int -dmu_recv_end(dmu_recv_cookie_t *drc) +dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { - if (drc->drc_logical_ds != drc->drc_real_ds) - return (dmu_recv_existing_end(drc)); + int error; + + drc->drc_owner = owner; + + if (drc->drc_newfs) + error = dmu_recv_new_end(drc); else - return (dmu_recv_new_end(drc)); + error = dmu_recv_existing_end(drc); + + if (error != 0) { + dmu_recv_cleanup_ds(drc); + } else if (drc->drc_guid_to_ds_map != NULL) { + (void) add_ds_to_guidmap(drc->drc_tofs, + drc->drc_guid_to_ds_map, + drc->drc_newsnapobj); + } + return (error); +} + +/* + * Return TRUE if this objset is currently being received into. 
+ */ +boolean_t +dmu_objset_is_receiving(objset_t *os) +{ + return (os->os_dsl_dataset != NULL && + os->os_dsl_dataset->ds_owner == dmu_recv_tag); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dmu_traverse.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c 27 Feb 2010 22:30:49 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c 10 Oct 2016 11:09:56 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2015 Chunwei Chen. All rights reserved. */ #include @@ -33,40 +34,57 @@ #include #include #include +#include +#include #include +#include -struct prefetch_data { +int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ +boolean_t send_holes_without_birth_time = B_TRUE; + +#ifdef _KERNEL +SYSCTL_DECL(_vfs_zfs); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN, + &send_holes_without_birth_time, 0, "Send holes without birth time"); +#endif + +typedef struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; - int pd_blks_max; - int pd_blks_fetched; + int32_t pd_bytes_fetched; int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; -}; + zbookmark_phys_t pd_resume; +} prefetch_data_t; -struct traverse_data { +typedef struct traverse_data { spa_t *td_spa; uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; + zbookmark_phys_t *td_resume; int td_flags; - struct prefetch_data *td_pfd; + prefetch_data_t *td_pfd; + boolean_t td_paused; + uint64_t td_hole_birth_enabled_txg; blkptr_cb_t *td_func; void *td_arg; -}; + boolean_t td_realloc_possible; +} traverse_data_t; -static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, - arc_buf_t *buf, uint64_t objset, uint64_t object); +static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object); +static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, + uint64_t objset, uint64_t object); -/* ARGSUSED */ static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { - struct traverse_data *td = arg; - zbookmark_t zb; + traverse_data_t *td = arg; + zbookmark_phys_t zb; - if (bp->blk_birth == 0) + if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) @@ -80,25 +98,24 @@ traverse_zil_block(zilog_t *zilog, blkpt return (0); } -/* ARGSUSED */ static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { - struct traverse_data *td = arg; + traverse_data_t *td = arg; if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; + zbookmark_phys_t zb; - if (bp->blk_birth == 0) + if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); - SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); + SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); @@ -107,7 +124,7 @@ 
traverse_zil_record(zilog_t *zilog, lr_t } static void -traverse_zil(struct traverse_data *td, zil_header_t *zh) +traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zilog_t *zilog; @@ -127,184 +144,383 @@ traverse_zil(struct traverse_data *td, z zil_free(zilog); } +typedef enum resume_skip { + RESUME_SKIP_ALL, + RESUME_SKIP_NONE, + RESUME_SKIP_CHILDREN +} resume_skip_t; + +/* + * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and + * the block indicated by zb does not need to be visited at all. Returns + * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the + * resume point. This indicates that this block should be visited but not its + * children (since they must have been visited in a previous traversal). + * Otherwise returns RESUME_SKIP_NONE. + */ +static resume_skip_t +resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, + const zbookmark_phys_t *zb) +{ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + /* + * If we already visited this bp & everything below, + * don't bother doing it again. + */ + if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) + return (RESUME_SKIP_ALL); + + /* + * If we found the block we're trying to resume from, zero + * the bookmark out to indicate that we have resumed. + */ + if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { + bzero(td->td_resume, sizeof (*zb)); + if (td->td_flags & TRAVERSE_POST) + return (RESUME_SKIP_CHILDREN); + } + } + return (RESUME_SKIP_NONE); +} + +static void +traverse_prefetch_metadata(traverse_data_t *td, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + + if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) + return; + /* + * If we are in the process of resuming, don't prefetch, because + * some children will not be needed (and in fact may have already + * been freed). 
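Elsewhere in this dmu_traverse.c change the prefetcher's look-ahead is bounded by a byte budget rather than the old fixed block count: the prefetch thread charges BP_GET_LSIZE() against pd_bytes_fetched and blocks once it reaches zfs_pd_bytes_max (50MB by default), while the traversal thread returns the credit as it consumes each block. A compact userland sketch of that credit scheme; the names are illustrative and, unlike the kernel code, the walk-through below is single-threaded so neither side ever blocks:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PD_BYTES_MAX    (50 * 1024 * 1024)      /* mirrors zfs_pd_bytes_max */

    static pthread_mutex_t pd_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t pd_cv = PTHREAD_COND_INITIALIZER;
    static int64_t pd_bytes_fetched;        /* how far the prefetcher has run ahead */

    /* Prefetch side: charge the block's size, blocking once the cap is reached. */
    static void
    prefetch_charge(int64_t size)
    {
            pthread_mutex_lock(&pd_mtx);
            while (pd_bytes_fetched >= PD_BYTES_MAX)
                    pthread_cond_wait(&pd_cv, &pd_mtx);
            pd_bytes_fetched += size;
            pthread_cond_broadcast(&pd_cv);
            pthread_mutex_unlock(&pd_mtx);
    }

    /* Traversal side: wait until the block has been charged, then return the credit. */
    static void
    consume_credit(int64_t size)
    {
            pthread_mutex_lock(&pd_mtx);
            while (pd_bytes_fetched < size)
                    pthread_cond_wait(&pd_cv, &pd_mtx);
            pd_bytes_fetched -= size;
            pthread_cond_broadcast(&pd_cv);
            pthread_mutex_unlock(&pd_mtx);
    }

    int
    main(void)
    {
            prefetch_charge(128 * 1024);
            prefetch_charge(128 * 1024);
            consume_credit(128 * 1024);
            printf("outstanding: %lld bytes\n", (long long)pd_bytes_fetched);
            return (0);
    }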
+ */ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) + return; + if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + return; + if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) + return; + + (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); +} + +static boolean_t +prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) +{ + ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + return (B_FALSE); + return (B_TRUE); +} + static int -traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, - arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) +traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, + const blkptr_t *bp, const zbookmark_phys_t *zb) { - zbookmark_t czb; - int err = 0, lasterr = 0; + zbookmark_phys_t czb; + int err = 0; arc_buf_t *buf = NULL; - struct prefetch_data *pd = td->td_pfd; + prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; - if (bp->blk_birth == 0) { - err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); - return (err); + switch (resume_skip_check(td, dnp, zb)) { + case RESUME_SKIP_ALL: + return (0); + case RESUME_SKIP_CHILDREN: + goto post; + case RESUME_SKIP_NONE: + break; + default: + ASSERT(0); } - if (bp->blk_birth <= td->td_min_txg) + if (bp->blk_birth == 0) { + /* + * Since this block has a birth time of 0 it must be one of + * two things: a hole created before the + * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole + * which has always been a hole in an object. + * + * If a file is written sparsely, then the unwritten parts of + * the file were "always holes" -- that is, they have been + * holes since this object was allocated. However, we (and + * our callers) can not necessarily tell when an object was + * allocated. Therefore, if it's possible that this object + * was freed and then its object number reused, we need to + * visit all the holes with birth==0. + * + * If it isn't possible that the object number was reused, + * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote + * all the blocks we will visit as part of this traversal, + * then this hole must have always existed, so we can skip + * it. We visit blocks born after (exclusive) td_min_txg. + * + * Note that the meta-dnode cannot be reallocated. 
+ */ + if (!send_holes_without_birth_time && + (!td->td_realloc_possible || + zb->zb_object == DMU_META_DNODE_OBJECT) && + td->td_hole_birth_enabled_txg <= td->td_min_txg) + return (0); + } else if (bp->blk_birth <= td->td_min_txg) { return (0); + } - if (pd && !pd->pd_exited && - ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { + if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { + uint64_t size = BP_GET_LSIZE(bp); mutex_enter(&pd->pd_mtx); - ASSERT(pd->pd_blks_fetched >= 0); - while (pd->pd_blks_fetched == 0 && !pd->pd_exited) + ASSERT(pd->pd_bytes_fetched >= 0); + while (pd->pd_bytes_fetched < size && !pd->pd_exited) cv_wait(&pd->pd_cv, &pd->pd_mtx); - pd->pd_blks_fetched--; + pd->pd_bytes_fetched -= size; cv_broadcast(&pd->pd_cv); mutex_exit(&pd->pd_mtx); } - if (td->td_flags & TRAVERSE_PRE) { + if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - if (err) - return (err); + if (err != 0) + goto post; + return (0); + } + + if (td->td_flags & TRAVERSE_PRE) { + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + goto post; } if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, - arc_getbuf_func, &buf, + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) - return (err); + if (err != 0) + goto post; + cbp = buf->b_data; + + for (i = 0; i < epb; i++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + traverse_prefetch_metadata(td, &cbp[i], &czb); + } /* recursively visitbp() blocks below this */ - cbp = buf->b_data; - for (i = 0; i < epb; i++, cbp++) { + for (i = 0; i < epb; i++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); - err = traverse_visitbp(td, dnp, buf, cbp, &czb); - if (err) { - if (!hard) - break; - lasterr = err; - } + err = traverse_visitbp(td, dnp, &cbp[i], &czb); + if (err != 0) + break; } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - err = arc_read(NULL, td->td_spa, bp, pbuf, - arc_getbuf_func, &buf, + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) - return (err); + if (err != 0) + goto post; + dnode_phys_t *child_dnp = buf->b_data; + + for (i = 0; i < epb; i++) { + prefetch_dnode_metadata(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); + } /* recursively visitbp() blocks below this */ - dnp = buf->b_data; - for (i = 0; i < epb; i++, dnp++) { - err = traverse_dnode(td, dnp, buf, zb->zb_objset, - zb->zb_blkid * epb + i); - if (err) { - if (!hard) - break; - lasterr = err; - } + for (i = 0; i < epb; i++) { + err = traverse_dnode(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); + if (err != 0) + break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; - objset_phys_t *osp; - dnode_phys_t *dnp; + arc_flags_t flags = ARC_FLAG_WAIT; - err = arc_read_nolock(NULL, td->td_spa, bp, - arc_getbuf_func, &buf, + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if 
(err) - return (err); - - osp = buf->b_data; - traverse_zil(td, &osp->os_zil_header); + if (err != 0) + goto post; - dnp = &osp->os_meta_dnode; - err = traverse_dnode(td, dnp, buf, zb->zb_objset, + objset_phys_t *osp = buf->b_data; + prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, DMU_META_DNODE_OBJECT); - if (err && hard) { - lasterr = err; - err = 0; + /* + * See the block comment above for the goal of this variable. + * If the maxblkid of the meta-dnode is 0, then we know that + * we've never had more than DNODES_PER_BLOCK objects in the + * dataset, which means we can't have reused any object ids. + */ + if (osp->os_meta_dnode.dn_maxblkid == 0) + td->td_realloc_possible = B_FALSE; + + if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { + prefetch_dnode_metadata(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } + + err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset, + DMU_META_DNODE_OBJECT); if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - dnp = &osp->os_userused_dnode; - err = traverse_dnode(td, dnp, buf, zb->zb_objset, - DMU_USERUSED_OBJECT); - } - if (err && hard) { - lasterr = err; - err = 0; + err = traverse_dnode(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { - dnp = &osp->os_groupused_dnode; - err = traverse_dnode(td, dnp, buf, zb->zb_objset, - DMU_GROUPUSED_OBJECT); + err = traverse_dnode(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); } } if (buf) - (void) arc_buf_remove_ref(buf, &buf); + arc_buf_destroy(buf, &buf); - if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) +post: + if (err == 0 && (td->td_flags & TRAVERSE_POST)) err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - return (err != 0 ? err : lasterr); + if (hard && (err == EIO || err == ECKSUM)) { + /* + * Ignore this disk error as requested by the HARD flag, + * and continue traversal. + */ + err = 0; + } + + /* + * If we are stopping here, set td_resume. + */ + if (td->td_resume != NULL && err != 0 && !td->td_paused) { + td->td_resume->zb_objset = zb->zb_objset; + td->td_resume->zb_object = zb->zb_object; + td->td_resume->zb_level = 0; + /* + * If we have stopped on an indirect block (e.g. due to + * i/o error), we have not visited anything below it. + * Set the bookmark to the first level-0 block that we need + * to visit. This way, the resuming code does not need to + * deal with resuming from indirect blocks. + * + * Note, if zb_level <= 0, dnp may be NULL, so we don't want + * to dereference it. 
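The bookmark fixup described here (and performed just below) shifts an indirect block's blkid down to the first level-0 blkid it covers: each indirect level fans out by 2^(dn_indblkshift - SPA_BLKPTRSHIFT) block pointers. A quick worked example, assuming the common 128K indirect block size (dn_indblkshift = 17) and 128-byte block pointers (SPA_BLKPTRSHIFT = 7), i.e. 1024 pointers per indirect block:

    #include <stdint.h>
    #include <stdio.h>

    #define SPA_BLKPTRSHIFT 7       /* 128-byte block pointers */

    /* First level-0 blkid covered by (level, blkid), as the resume fixup computes. */
    static uint64_t
    first_l0_blkid(uint64_t blkid, int level, int indblkshift)
    {
            return (blkid << (level * (indblkshift - SPA_BLKPTRSHIFT)));
    }

    int
    main(void)
    {
            /* With 128K indirect blocks each level fans out 1024-way. */
            printf("L1 blkid 3 covers L0 blkids starting at %llu\n",
                (unsigned long long)first_l0_blkid(3, 1, 17));  /* 3072 */
            printf("L2 blkid 3 covers L0 blkids starting at %llu\n",
                (unsigned long long)first_l0_blkid(3, 2, 17));  /* 3145728 */
            return (0);
    }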
+ */ + td->td_resume->zb_blkid = zb->zb_blkid; + if (zb->zb_level > 0) { + td->td_resume->zb_blkid <<= zb->zb_level * + (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); + } + td->td_paused = B_TRUE; + } + + return (err); +} + +static void +prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int j; + zbookmark_phys_t czb; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); + } } static int -traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, - arc_buf_t *buf, uint64_t objset, uint64_t object) +traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object) { - int j, err = 0, lasterr = 0; - zbookmark_t czb; - boolean_t hard = (td->td_flags & TRAVERSE_HARD); + int j, err = 0; + zbookmark_phys_t czb; + + if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && + object < td->td_resume->zb_object) + return (0); + + if (td->td_flags & TRAVERSE_PRE) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_blkptr[j], &czb); - if (err) { - if (!hard) - break; - lasterr = err; - } + err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); + if (err != 0) + break; + } + + if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); + } + + if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); } - return (err != 0 ? 
err : lasterr); + return (err); } /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - struct prefetch_data *pfd = arg; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + prefetch_data_t *pfd = arg; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; - ASSERT(pfd->pd_blks_fetched >= 0); + ASSERT(pfd->pd_bytes_fetched >= 0); + if (bp == NULL) + return (0); if (pfd->pd_cancel) - return (EINTR); + return (SET_ERROR(EINTR)); - if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || - BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) + if (!prefetch_needed(pfd, bp)) return (0); mutex_enter(&pfd->pd_mtx); - while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max) + while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) cv_wait(&pfd->pd_cv, &pfd->pd_mtx); - pfd->pd_blks_fetched++; + pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); - (void) arc_read_nolock(NULL, spa, bp, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, zb); + (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); return (0); } @@ -312,17 +528,18 @@ traverse_prefetcher(spa_t *spa, zilog_t static void traverse_prefetch_thread(void *arg) { - struct traverse_data *td_main = arg; - struct traverse_data td = *td_main; - zbookmark_t czb; + traverse_data_t *td_main = arg; + traverse_data_t td = *td_main; + zbookmark_phys_t czb; td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; + td.td_resume = &td_main->td_pfd->pd_resume; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); + (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); td_main->td_pfd->pd_exited = B_TRUE; @@ -335,36 +552,68 @@ traverse_prefetch_thread(void *arg) * in syncing context). */ static int -traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, - uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) +traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, + blkptr_cb_t func, void *arg) { - struct traverse_data td; - struct prefetch_data pd = { 0 }; - zbookmark_t czb; + traverse_data_t td; + prefetch_data_t pd = { 0 }; + zbookmark_phys_t czb; int err; + ASSERT(ds == NULL || objset == ds->ds_object); + ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + td.td_spa = spa; td.td_objset = objset; td.td_rootbp = rootbp; td.td_min_txg = txg_start; + td.td_resume = resume; td.td_func = func; td.td_arg = arg; td.td_pfd = &pd; td.td_flags = flags; + td.td_paused = B_FALSE; + td.td_realloc_possible = (txg_start == 0 ? 
B_FALSE : B_TRUE); + + if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { + VERIFY(spa_feature_enabled_txg(spa, + SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg)); + } else { + td.td_hole_birth_enabled_txg = UINT64_MAX; + } - pd.pd_blks_max = 100; pd.pd_flags = flags; + if (resume != NULL) + pd.pd_resume = *resume; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); - if (!(flags & TRAVERSE_PREFETCH) || + /* See comment on ZIL traversal in dsl_scan_visitds. */ + if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { + arc_flags_t flags = ARC_FLAG_WAIT; + objset_phys_t *osp; + arc_buf_t *buf; + + err = arc_read(NULL, td.td_spa, rootbp, + arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + if (err != 0) + return (err); + + osp = buf->b_data; + traverse_zil(&td, &osp->os_zil_header); + arc_buf_destroy(buf, &buf); + } + + if (!(flags & TRAVERSE_PREFETCH_DATA) || 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; - SET_BOOKMARK(&czb, objset, + SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); + err = traverse_visitbp(&td, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); pd.pd_cancel = B_TRUE; @@ -384,11 +633,28 @@ traverse_impl(spa_t *spa, uint64_t objse * in syncing context). */ int -traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, +traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, + zbookmark_phys_t *resume, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, + &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); +} + +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); +} + +int +traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { - return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object, - &ds->ds_phys->ds_bp, txg_start, flags, func, arg)); + return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, + blkptr, txg_start, resume, flags, func, arg)); } /* @@ -398,56 +664,50 @@ int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { - int err, lasterr = 0; - uint64_t obj; + int err; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ - err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), - txg_start, flags, func, arg); - if (err) + err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), + txg_start, NULL, flags, func, arg); + if (err != 0) return (err); /* visit each dataset */ - for (obj = 1; err == 0 || (err != ESRCH && hard); - err = dmu_object_next(mos, &obj, FALSE, txg_start)) { + for (uint64_t obj = 1; err == 0; + err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); - if (err) { - if (!hard) - return (err); - lasterr = err; - continue; + if (err != 0) { + if (hard) + continue; + break; } - if (doi.doi_type == DMU_OT_DSL_DATASET) { + if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { dsl_dataset_t *ds; uint64_t txg = txg_start; - rw_enter(&dp->dp_config_rwlock, RW_READER); + dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, obj, FTAG, 
&ds); - rw_exit(&dp->dp_config_rwlock); - if (err) { - if (!hard) - return (err); - lasterr = err; - continue; + dsl_pool_config_exit(dp, FTAG); + if (err != 0) { + if (hard) + continue; + break; } - if (ds->ds_phys->ds_prev_snap_txg > txg) - txg = ds->ds_phys->ds_prev_snap_txg; + if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) + txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err) { - if (!hard) - return (err); - lasterr = err; - } + if (err != 0) + break; } } if (err == ESRCH) err = 0; - return (err != 0 ? err : lasterr); + return (err); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dmu_tx.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c 27 Feb 2010 22:30:50 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c 16 May 2017 01:04:48 -0000 @@ -19,8 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include @@ -33,7 +35,10 @@ #include #include /* for fzap_default_block_shift */ #include +#include +#include #include +#include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); @@ -44,12 +49,13 @@ dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); tx->tx_dir = dd; - if (dd) + if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); + tx->tx_start = gethrtime(); #ifdef ZFS_DEBUG refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); @@ -123,6 +129,12 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, ob txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); txh->txh_tx = tx; txh->txh_dnode = dn; + refcount_create(&txh->txh_space_towrite); + refcount_create(&txh->txh_space_tofree); + refcount_create(&txh->txh_space_tooverwrite); + refcount_create(&txh->txh_space_tounref); + refcount_create(&txh->txh_memory_tohold); + refcount_create(&txh->txh_fudge); #ifdef ZFS_DEBUG txh->txh_type = type; txh->txh_arg1 = arg1; @@ -156,7 +168,7 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t * db = dbuf_hold_level(dn, level, blkid, FTAG); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) - return (EIO); + return (SET_ERROR(EIO)); err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); @@ -184,7 +196,7 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dn ASSERT(level != 0); db = NULL; } else { - ASSERT(db->db_dnode == dn); + ASSERT(DB_DNODE(db) == dn); ASSERT(db->db_level == level); ASSERT(db->db.db_size == space); ASSERT(db->db_blkid == blkid); @@ -193,14 +205,20 @@ dmu_tx_count_twig(dmu_tx_hold_t *txh, dn } freeable = (bp && (freeable || - dsl_dataset_block_freeable(ds, bp->blk_birth))); + dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); - if (freeable) - txh->txh_space_tooverwrite += space; - else - txh->txh_space_towrite += space; - if (bp) - 
txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); + if (freeable) { + (void) refcount_add_many(&txh->txh_space_tooverwrite, + space, FTAG); + } else { + (void) refcount_add_many(&txh->txh_space_towrite, + space, FTAG); + } + + if (bp) { + (void) refcount_add_many(&txh->txh_space_tounref, + bp_get_dsize(os->os_spa, bp), FTAG); + } dmu_tx_count_twig(txh, dn, parent, level + 1, blkid >> epbs, freeable, history); @@ -219,7 +237,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u return; min_bs = SPA_MINBLOCKSHIFT; - max_bs = SPA_MAXBLOCKSHIFT; + max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; @@ -280,6 +298,7 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u delta = P2NPHASE(off, dn->dn_datablksz); } + min_ibs = max_ibs = dn->dn_indblkshift; if (dn->dn_maxblkid > 0) { /* * The blocksize can't change, @@ -287,13 +306,14 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u */ ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; - min_ibs = max_ibs = dn->dn_indblkshift; - } else if (dn->dn_indblkshift > max_ibs) { + } else { /* - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on older pools. + * The blocksize can increase up to the recordsize, + * or if it is already more than the recordsize, + * up to the next power of 2. */ - min_ibs = max_ibs = dn->dn_indblkshift; + min_bs = highbit64(dn->dn_datablksz - 1); + max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1)); } /* @@ -308,8 +328,15 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); - db = dbuf_hold_level(dn, 0, start, FTAG); + err = dbuf_hold_impl(dn, 0, start, + FALSE, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); + + if (err) { + txh->txh_tx->tx_err = err; + return; + } + dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, history); dbuf_rele(db, FTAG); @@ -321,8 +348,11 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u bits = 64 - min_bs; epbs = min_ibs - SPA_BLKPTRSHIFT; for (bits -= epbs * (nlvls - 1); - bits >= 0; bits -= epbs) - txh->txh_fudge += 1ULL << max_ibs; + bits >= 0; bits -= epbs) { + (void) refcount_add_many( + &txh->txh_fudge, + 1ULL << max_ibs, FTAG); + } goto out; } off += delta; @@ -338,7 +368,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u */ start = P2ALIGN(off, 1ULL << max_bs); end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; - txh->txh_space_towrite += end - start + 1; + (void) refcount_add_many(&txh->txh_space_towrite, + end - start + 1, FTAG); start >>= min_bs; end >>= min_bs; @@ -353,20 +384,23 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, u start >>= epbs; end >>= epbs; ASSERT3U(end, >=, start); - txh->txh_space_towrite += (end - start + 1) << max_ibs; + (void) refcount_add_many(&txh->txh_space_towrite, + (end - start + 1) << max_ibs, FTAG); if (start != 0) { /* * We also need a new blkid=0 indirect block * to reference any existing file data. 
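
dmu_tx_count_write() above charges the whole level-0 span touched by a write -- the requested range rounded out to the maximum block size -- to txh_space_towrite. A minimal standalone sketch of that P2ALIGN/P2ROUNDUP rounding, assuming a hypothetical 128K maximum block size and made-up offsets (none of this is code from the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified forms of the kernel's P2ALIGN/P2ROUNDUP macros. */
    #define P2ALIGN(x, align)       ((x) & -(align))
    #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

    int
    main(void)
    {
        int max_bs = 17;                    /* assumed 128K max block shift */
        uint64_t off = 100 * 1024;          /* example write: 100K, length 200K */
        uint64_t len = 200 * 1024;

        uint64_t start = P2ALIGN(off, 1ULL << max_bs);
        uint64_t end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;

        /* Worst-case level-0 bytes charged to txh_space_towrite: 3 x 128K. */
        printf("charge %llu bytes for aligned span [%llu, %llu]\n",
            (unsigned long long)(end - start + 1),
            (unsigned long long)start, (unsigned long long)end);
        return (0);
    }
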
*/ - txh->txh_space_towrite += 1ULL << max_ibs; + (void) refcount_add_many(&txh->txh_space_towrite, + 1ULL << max_ibs, FTAG); } } out: - if (txh->txh_space_towrite + txh->txh_space_tooverwrite > + if (refcount_count(&txh->txh_space_towrite) + + refcount_count(&txh->txh_space_tooverwrite) > 2 * DMU_MAX_ACCESS) - err = EFBIG; + err = SET_ERROR(EFBIG); if (err) txh->txh_tx->tx_err = err; @@ -376,19 +410,22 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode; + dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); if (dn && dn->dn_dbuf->db_blkptr && dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr->blk_birth)) { - txh->txh_space_tooverwrite += space; - txh->txh_space_tounref += space; + dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { + (void) refcount_add_many(&txh->txh_space_tooverwrite, + space, FTAG); + (void) refcount_add_many(&txh->txh_space_tounref, space, FTAG); } else { - txh->txh_space_towrite += space; - if (dn && dn->dn_dbuf->db_blkptr) - txh->txh_space_tounref += space; + (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); + if (dn && dn->dn_dbuf->db_blkptr) { + (void) refcount_add_many(&txh->txh_space_tounref, + space, FTAG); + } } } @@ -419,6 +456,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; int epbs; + uint64_t l0span = 0, nl1blks = 0; if (dn->dn_nlevels == 0) return; @@ -427,7 +465,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. - * Also, dbuf_hold_level() wants us to have the struct_rwlock. + * Also, dbuf_hold_impl() wants us to have the struct_rwlock. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -443,44 +481,31 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui blkid = off >> dn->dn_datablkshift; nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; - if (blkid >= dn->dn_maxblkid) { + if (blkid > dn->dn_maxblkid) { rw_exit(&dn->dn_struct_rwlock); return; } if (blkid + nblks > dn->dn_maxblkid) - nblks = dn->dn_maxblkid - blkid; + nblks = dn->dn_maxblkid - blkid + 1; } + l0span = nblks; /* save for later use to calc level > 1 overhead */ if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { + if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); space += bp_get_dsize(spa, bp); } unref += BP_GET_ASIZE(bp); } + nl1blks = 1; nblks = 0; } - /* - * Add in memory requirements of higher-level indirects. - * This assumes a worst-possible scenario for dn_nlevels. - */ - { - uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs); - int level = (dn->dn_nlevels > 1) ? 
2 : 1; - - while (level++ < DN_MAX_LEVELS) { - txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift; - blkcnt = 1 + (blkcnt >> epbs); - } - ASSERT(blkcnt <= dn->dn_nblkptr); - } - lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; @@ -515,9 +540,15 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); - dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); + err = dbuf_hold_impl(dn, 1, blkid >> epbs, + FALSE, FALSE, FTAG, &dbuf); + if (err) { + txh->txh_tx->tx_err = err; + break; + } - txh->txh_memory_tohold += dbuf->db.db_size; + (void) refcount_add_many(&txh->txh_memory_tohold, + dbuf->db.db_size, FTAG); /* * We don't check memory_tohold against DMU_MAX_ACCESS because @@ -538,7 +569,8 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui bp += blkoff; for (i = 0; i < tochk; i++) { - if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { + if (dsl_dataset_block_freeable(ds, &bp[i], + bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); space += bp_get_dsize(spa, &bp[i]); } @@ -546,19 +578,75 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, ui } dbuf_rele(dbuf, FTAG); + ++nl1blks; blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); + /* + * Add in memory requirements of higher-level indirects. + * This assumes a worst-possible scenario for dn_nlevels and a + * worst-possible distribution of l1-blocks over the region to free. + */ + { + uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); + int level = 2; + /* + * Here we don't use DN_MAX_LEVEL, but calculate it with the + * given datablkshift and indblkshift. This makes the + * difference between 19 and 8 on large files. + */ + int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / + (dn->dn_indblkshift - SPA_BLKPTRSHIFT); + + while (level++ < maxlevel) { + (void) refcount_add_many(&txh->txh_memory_tohold, + MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift, + FTAG); + blkcnt = 1 + (blkcnt >> epbs); + } + } + /* account for new level 1 indirect blocks that might show up */ if (skipped > 0) { - txh->txh_fudge += skipped << dn->dn_indblkshift; + (void) refcount_add_many(&txh->txh_fudge, + skipped << dn->dn_indblkshift, FTAG); skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); - txh->txh_memory_tohold += skipped << dn->dn_indblkshift; + (void) refcount_add_many(&txh->txh_memory_tohold, + skipped << dn->dn_indblkshift, FTAG); } - txh->txh_space_tofree += space; - txh->txh_space_tounref += unref; + (void) refcount_add_many(&txh->txh_space_tofree, space, FTAG); + (void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG); +} + +/* + * This function marks the transaction as being a "net free". The end + * result is that refquotas will be disabled for this transaction, and + * this transaction will be able to use half of the pool space overhead + * (see dsl_pool_adjustedsize()). Therefore this function should only + * be called for transactions that we expect will not cause a net increase + * in the amount of space used (but it's OK if that is occasionally not true). + */ +void +dmu_tx_mark_netfree(dmu_tx_t *tx) +{ + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + DMU_NEW_OBJECT, THT_FREE, 0, 0); + + /* + * Pretend that this operation will free 1GB of space. This + * should be large enough to cancel out the largest write. + * We don't want to use something like UINT64_MAX, because that would + * cause overflows when doing math with these values (e.g. in + * dmu_tx_try_assign()). 
+ */ + (void) refcount_add_many(&txh->txh_space_tofree, + 1024 * 1024 * 1024, FTAG); + (void) refcount_add_many(&txh->txh_space_tounref, + 1024 * 1024 * 1024, FTAG); } void @@ -566,8 +654,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t { dmu_tx_hold_t *txh; dnode_t *dn; - uint64_t start, end, i; - int err, shift; + int err; zio_t *zio; ASSERT(tx->tx_txg == 0); @@ -577,14 +664,6 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t if (txh == NULL) return; dn = txh->txh_dnode; - - /* first block */ - if (off != 0) - dmu_tx_count_write(txh, off, 1); - /* last block */ - if (len != DMU_OBJECT_END) - dmu_tx_count_write(txh, off+len, 1); - dmu_tx_count_dnode(txh); if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) @@ -592,24 +671,54 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + + /* + * For i/o error checking, we read the first and last level-0 + * blocks if they are not aligned, and all the level-1 blocks. + * + * Note: dbuf_free_range() assumes that we have not instantiated + * any level-0 dbufs that will be completely freed. Therefore we must + * exercise care to not read or count the first and last blocks + * if they are blocksize-aligned. + */ + if (dn->dn_datablkshift == 0) { + if (off != 0 || len < dn->dn_datablksz) + dmu_tx_count_write(txh, 0, dn->dn_datablksz); + } else { + /* first block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off, 1); + /* last block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off+len, 1); + } + /* - * For i/o error checking, read the first and last level-0 - * blocks, and all the level-1 blocks. The above count_write's - * have already taken care of the level-0 blocks. + * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { - shift = dn->dn_datablkshift + dn->dn_indblkshift - + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; - start = off >> shift; - end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; + uint64_t start = off >> shift; + uint64_t end = (off + len) >> shift; + + ASSERT(dn->dn_indblkshift != 0); + + /* + * dnode_reallocate() can result in an object with indirect + * blocks having an odd data block size. In this case, + * just check the single block. + */ + if (dn->dn_datablkshift == 0) + start = end = 0; zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - for (i = start; i <= end; i++) { + for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; - if (err == ESRCH) + if (err == ESRCH || i > end) break; if (err) { tx->tx_err = err; @@ -637,8 +746,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o { dmu_tx_hold_t *txh; dnode_t *dn; - uint64_t nblocks; - int epbs, err; + int err; ASSERT(tx->tx_txg == 0); @@ -660,9 +768,11 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o return; } - ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); + ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { + blkptr_t *bp; + /* * If there is only one block (i.e. this is a micro-zap) * and we are not adding anything, the accounting is simple. @@ -677,14 +787,19 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o * Use max block size here, since we don't know how much * the size will change between now and the dbuf dirty call. 
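
In dmu_tx_hold_free() above, only the unaligned first and last level-0 blocks of the freed range are charged as writes, because dbuf_free_range() assumes completely freed level-0 dbufs are never instantiated. A small standalone sketch of that boundary test, using a simplified IS_P2ALIGNED() and an assumed 128K block size (the offsets are illustrative, not from the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified form of the kernel's IS_P2ALIGNED() macro. */
    #define IS_P2ALIGNED(v, a)      (((v) & ((a) - 1)) == 0)

    int
    main(void)
    {
        uint64_t blksz = 128 * 1024;        /* assumed data block size */
        uint64_t off = 64 * 1024;           /* example free: 64K, length 256K */
        uint64_t len = 256 * 1024;

        /* First block is read and counted only if the start is unaligned. */
        if (!IS_P2ALIGNED(off, blksz))
            printf("count a write for the first block (offset %llu)\n",
                (unsigned long long)off);

        /* Last block is read and counted only if the end is unaligned. */
        if (!IS_P2ALIGNED(off + len, blksz))
            printf("count a write for the last block (offset %llu)\n",
                (unsigned long long)(off + len));
        return (0);
    }
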
*/ + bp = &dn->dn_phys->dn_blkptr[0]; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth)) { - txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; + bp, bp->blk_birth)) { + (void) refcount_add_many(&txh->txh_space_tooverwrite, + MZAP_MAX_BLKSZ, FTAG); } else { - txh->txh_space_towrite += SPA_MAXBLOCKSIZE; + (void) refcount_add_many(&txh->txh_space_towrite, + MZAP_MAX_BLKSZ, FTAG); + } + if (!BP_IS_HOLE(bp)) { + (void) refcount_add_many(&txh->txh_space_tounref, + MZAP_MAX_BLKSZ, FTAG); } - if (dn->dn_phys->dn_blkptr[0].blk_birth) - txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } @@ -693,27 +808,41 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t o * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. */ - err = zap_lookup(dn->dn_objset, dn->dn_object, name, - 8, 0, NULL); + err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; return; } } - err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, + err = zap_count_write_by_dnode(dn, name, add, &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, - * we'll have to modify an indirect twig for each. + * we'll have to modify an indirect twig for each. We can make + * modifications at up to 3 locations: + * - header block at the beginning of the object + * - target leaf block + * - end of the object, where we might need to write: + * - a new leaf block if the target block needs to be split + * - the new pointer table, if it is growing + * - the new cookie table, if it is growing */ - epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) - txh->txh_space_towrite += 3 << dn->dn_indblkshift; - else - txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dsl_dataset_phys_t *ds_phys = + dsl_dataset_phys(dn->dn_objset->os_dsl_dataset); + for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) { + uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl)); + uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift; + if (ds_phys->ds_prev_snap_obj != 0) { + (void) refcount_add_many(&txh->txh_space_towrite, + spc, FTAG); + } else { + (void) refcount_add_many(&txh->txh_space_tooverwrite, + spc, FTAG); + } + } } void @@ -738,7 +867,7 @@ dmu_tx_hold_space(dmu_tx_t *tx, uint64_t txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); - txh->txh_space_towrite += space; + (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); } int @@ -773,18 +902,24 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; - dnode_t *dn = db->db_dnode; + dnode_t *dn; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); - if (tx->tx_anyobj) + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); return; + } /* XXX No checking on the meta dnode for now */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); return; + } for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { @@ -813,10 +948,11 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i match_offset = TRUE; /* * We will let this hold work for the bonus - * buffer so that 
we don't need to hold it - * when creating a new object. + * or spill buffer so that we don't need to + * hold it when creating a new object. */ - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID || + blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, @@ -837,8 +973,12 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; + case THT_SPILL: + if (blkid == DMU_SPILL_BLKID) + match_offset = TRUE; + break; case THT_BONUS: - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: @@ -851,24 +991,186 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i ASSERT(!"bad txh_type"); } } - if (match_object && match_offset) + if (match_object && match_offset) { + DB_DNODE_EXIT(db); return; + } } + DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif +/* + * If we can't do 10 iops, something is wrong. Let us go ahead + * and hit zfs_dirty_data_max. + */ +hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); +int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ + +/* + * We delay transactions when we've determined that the backend storage + * isn't able to accommodate the rate of incoming writes. + * + * If there is already a transaction waiting, we delay relative to when + * that transaction finishes waiting. This way the calculated min_time + * is independent of the number of threads concurrently executing + * transactions. + * + * If we are the only waiter, wait relative to when the transaction + * started, rather than the current time. This credits the transaction for + * "time already served", e.g. reading indirect blocks. + * + * The minimum time for a transaction to take is calculated as: + * min_time = scale * (dirty - min) / (max - dirty) + * min_time is then capped at zfs_delay_max_ns. + * + * The delay has two degrees of freedom that can be adjusted via tunables. + * The percentage of dirty data at which we start to delay is defined by + * zfs_delay_min_dirty_percent. This should typically be at or above + * zfs_vdev_async_write_active_max_dirty_percent so that we only start to + * delay after writing at full speed has failed to keep up with the incoming + * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly + * speaking, this variable determines the amount of delay at the midpoint of + * the curve. + * + * delay + * 10ms +-------------------------------------------------------------*+ + * | *| + * 9ms + *+ + * | *| + * 8ms + *+ + * | * | + * 7ms + * + + * | * | + * 6ms + * + + * | * | + * 5ms + * + + * | * | + * 4ms + * + + * | * | + * 3ms + * + + * | * | + * 2ms + (midpoint) * + + * | | ** | + * 1ms + v *** + + * | zfs_delay_scale ----------> ******** | + * 0 +-------------------------------------*********----------------+ + * 0% <- zfs_dirty_data_max -> 100% + * + * Note that since the delay is added to the outstanding time remaining on the + * most recent transaction, the delay is effectively the inverse of IOPS. + * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve + * was chosen such that small changes in the amount of accumulated dirty data + * in the first 3/4 of the curve yield relatively small differences in the + * amount of delay. 
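
Before the comment continues below with the log-scale view of the same curve, here is a self-contained sketch of the delay calculation itself: min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty), capped at zfs_delay_max_ns. The 100 ms cap and the 500 us midpoint follow the text above; the 4 GiB zfs_dirty_data_max and 60% zfs_delay_min_dirty_percent are assumed, illustrative values rather than settings from this patch:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Illustrative tunables; only the cap and scale follow the text above. */
        uint64_t zfs_dirty_data_max = 4ULL * 1024 * 1024 * 1024;   /* assumed 4 GiB */
        uint64_t zfs_delay_min_dirty_percent = 60;                 /* assumed */
        uint64_t zfs_delay_scale = 500 * 1000;                     /* 500 us midpoint */
        uint64_t zfs_delay_max_ns = 100ULL * 1000 * 1000;          /* 100 ms cap */

        uint64_t delay_min_bytes =
            zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

        /* Sample the curve at a few dirty-data levels. */
        for (int pct = 60; pct < 100; pct += 10) {
            uint64_t dirty = zfs_dirty_data_max / 100 * pct;
            if (dirty <= delay_min_bytes) {
                printf("%3d%% dirty: no delay\n", pct);
                continue;
            }
            uint64_t min_tx_time = zfs_delay_scale *
                (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
            if (min_tx_time > zfs_delay_max_ns)
                min_tx_time = zfs_delay_max_ns;
            printf("%3d%% dirty: delay at least %llu ns\n", pct,
                (unsigned long long)min_tx_time);
        }
        return (0);
    }
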
+ * + * The effects can be easier to understand when the amount of delay is + * represented on a log scale: + * + * delay + * 100ms +-------------------------------------------------------------++ + * + + + * | | + * + *+ + * 10ms + *+ + * + ** + + * | (midpoint) ** | + * + | ** + + * 1ms + v **** + + * + zfs_delay_scale ----------> ***** + + * | **** | + * + **** + + * 100us + ** + + * + * + + * | * | + * + * + + * 10us + * + + * + + + * | | + * + + + * +--------------------------------------------------------------+ + * 0% <- zfs_dirty_data_max -> 100% + * + * Note here that only as the amount of dirty data approaches its limit does + * the delay start to increase rapidly. The goal of a properly tuned system + * should be to keep the amount of dirty data out of that range by first + * ensuring that the appropriate limits are set for the I/O scheduler to reach + * optimal throughput on the backend storage, and then by changing the value + * of zfs_delay_scale to increase the steepness of the curve. + */ +static void +dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) +{ + dsl_pool_t *dp = tx->tx_pool; + uint64_t delay_min_bytes = + zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; + hrtime_t wakeup, min_tx_time, now; + + if (dirty <= delay_min_bytes) + return; + + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which could + * cause a divide-by-zero if it's == the max. + */ + ASSERT3U(dirty, <, zfs_dirty_data_max); + + now = gethrtime(); + min_tx_time = zfs_delay_scale * + (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); + if (now > tx->tx_start + min_tx_time) + return; + + min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); + + DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, + uint64_t, min_tx_time); + + mutex_enter(&dp->dp_lock); + wakeup = MAX(tx->tx_start + min_tx_time, + dp->dp_last_wakeup + min_tx_time); + dp->dp_last_wakeup = wakeup; + mutex_exit(&dp->dp_lock); + +#ifdef _KERNEL +#ifdef illumos + mutex_enter(&curthread->t_delay_lock); + while (cv_timedwait_hires(&curthread->t_delay_cv, + &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, + CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) + continue; + mutex_exit(&curthread->t_delay_lock); +#endif +#ifdef __FreeBSD__ + pause_sbt("dmu_tx_delay", wakeup * SBT_1NS, + zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE); +#endif +#ifdef __NetBSD__ + kpause("dmu_tx_delay", false, (wakeup - now) * hz / 1000000000, NULL); +#endif +#else + hrtime_t delta = wakeup - gethrtime(); + struct timespec ts; + ts.tv_sec = delta / NANOSEC; + ts.tv_nsec = delta % NANOSEC; + (void) nanosleep(&ts, NULL); +#endif +} + static int -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) { dmu_tx_hold_t *txh; spa_t *spa = tx->tx_pool->dp_spa; uint64_t memory, asize, fsize, usize; uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; - ASSERT3U(tx->tx_txg, ==, 0); + ASSERT0(tx->tx_txg); if (tx->tx_err) return (tx->tx_err); @@ -885,9 +1187,15 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t */ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && txg_how != TXG_WAIT) - return (EIO); + return (SET_ERROR(EIO)); + + return (SET_ERROR(ERESTART)); + } - return (ERESTART); + if (!tx->tx_waited && + dsl_pool_need_dirty_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + return (SET_ERROR(ERESTART)); } tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); @@ 
-908,7 +1216,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; - return (ERESTART); + return (SET_ERROR(ERESTART)); } if (dn->dn_assigned_txg == 0) dn->dn_assigned_txg = tx->tx_txg; @@ -916,22 +1224,15 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } - towrite += txh->txh_space_towrite; - tofree += txh->txh_space_tofree; - tooverwrite += txh->txh_space_tooverwrite; - tounref += txh->txh_space_tounref; - tohold += txh->txh_memory_tohold; - fudge += txh->txh_fudge; + towrite += refcount_count(&txh->txh_space_towrite); + tofree += refcount_count(&txh->txh_space_tofree); + tooverwrite += refcount_count(&txh->txh_space_tooverwrite); + tounref += refcount_count(&txh->txh_space_tounref); + tohold += refcount_count(&txh->txh_memory_tohold); + fudge += refcount_count(&txh->txh_fudge); } /* - * NB: This check must be after we've held the dnodes, so that - * the dmu_tx_unassign() logic will work properly - */ - if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) - return (ERESTART); - - /* * If a snapshot has been taken since we made our estimates, * assume that we won't be able to free or overwrite anything. */ @@ -984,6 +1285,10 @@ dmu_tx_unassign(dmu_tx_t *tx) txg_rele_to_quiesce(&tx->tx_txgh); + /* + * Walk the transaction's hold list, removing the hold on the + * associated dnode, and notifying waiters if the refcount drops to 0. + */ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -1011,26 +1316,33 @@ dmu_tx_unassign(dmu_tx_t *tx) * * (1) TXG_WAIT. If the current open txg is full, waits until there's * a new one. This should be used when you're not holding locks. - * If will only fail if we're truly out of space (or over quota). + * It will only fail if we're truly out of space (or over quota). * * (2) TXG_NOWAIT. If we can't assign into the current open txg without * blocking, returns immediately with ERESTART. This should be used * whenever you're holding locks. On an ERESTART error, the caller * should drop locks, do a dmu_tx_wait(tx), and try again. * - * (3) A specific txg. Use this if you need to ensure that multiple - * transactions all sync in the same txg. Like TXG_NOWAIT, it - * returns ERESTART if it can't assign you into the requested txg. + * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() + * has already been called on behalf of this operation (though + * most likely on a different tx). */ int -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) { int err; ASSERT(tx->tx_txg == 0); - ASSERT(txg_how != 0); + ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || + txg_how == TXG_WAITED); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + /* If we might wait, we must not hold the config lock. */ + ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); + + if (txg_how == TXG_WAITED) + tx->tx_waited = B_TRUE; + while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); @@ -1049,17 +1361,48 @@ void dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; + dsl_pool_t *dp = tx->tx_pool; ASSERT(tx->tx_txg == 0); + ASSERT(!dsl_pool_config_held(tx->tx_pool)); - /* - * It's possible that the pool has become active after this thread - * has tried to obtain a tx. If that's the case then his - * tx_lasttried_txg would not have been assigned. 
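
The dmu_tx_assign() comment above spells out the TXG_NOWAIT protocol: on ERESTART the caller drops its locks, calls dmu_tx_wait(), and retries (TXG_WAITED then records that the wait already happened). A self-contained mock of that control flow; the *_mock functions are stand-ins defined here for illustration and are not the real DMU API:

    #include <errno.h>
    #include <stdio.h>

    /* Toy stand-ins for dmu_tx_assign()/dmu_tx_wait(), illustration only. */
    struct mock_tx { int attempts; };

    static int
    dmu_tx_assign_mock(struct mock_tx *tx)
    {
        /* Pretend the first attempt finds the open txg full. */
        return (tx->attempts++ == 0 ? ERESTART : 0);
    }

    static void
    dmu_tx_wait_mock(struct mock_tx *tx)
    {
        printf("attempt %d: waiting for dirty data / txg\n", tx->attempts);
    }

    int
    main(void)
    {
        struct mock_tx tx = { 0 };
        int err;

        for (;;) {
            err = dmu_tx_assign_mock(&tx);      /* TXG_NOWAIT style */
            if (err == 0)
                break;
            if (err != ERESTART)
                return (err);                   /* real code aborts the tx */
            /* drop locks here, then wait and retry */
            dmu_tx_wait_mock(&tx);
        }
        printf("assigned after %d attempt(s)\n", tx.attempts);
        return (0);
    }
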
- */ - if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { - txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1); + if (tx->tx_wait_dirty) { + /* + * dmu_tx_try_assign() has determined that we need to wait + * because we've consumed much or all of the dirty buffer + * space. + */ + mutex_enter(&dp->dp_lock); + while (dp->dp_dirty_total >= zfs_dirty_data_max) + cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); + uint64_t dirty = dp->dp_dirty_total; + mutex_exit(&dp->dp_lock); + + dmu_tx_delay(tx, dirty); + + tx->tx_wait_dirty = B_FALSE; + + /* + * Note: setting tx_waited only has effect if the caller + * used TX_WAIT. Otherwise they are going to destroy + * this tx and try again. The common case, zfs_write(), + * uses TX_WAIT. + */ + tx->tx_waited = B_TRUE; + } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { + /* + * If the pool is suspended we need to wait until it + * is resumed. Note that it's possible that the pool + * has become active after this thread has tried to + * obtain a tx. If that's the case then tx_lasttried_txg + * would not have been set. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } else if (tx->tx_needassign_txh) { + /* + * A dnode is assigned to the quiescing txg. Wait for its + * transaction to complete. + */ dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); @@ -1089,20 +1432,59 @@ dmu_tx_willuse_space(dmu_tx_t *tx, int64 #endif } -void -dmu_tx_commit(dmu_tx_t *tx) +static void +dmu_tx_destroy(dmu_tx_t *tx) { dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg != 0); - - while (txh = list_head(&tx->tx_holds)) { + while ((txh = list_head(&tx->tx_holds)) != NULL) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); + refcount_destroy_many(&txh->txh_space_towrite, + refcount_count(&txh->txh_space_towrite)); + refcount_destroy_many(&txh->txh_space_tofree, + refcount_count(&txh->txh_space_tofree)); + refcount_destroy_many(&txh->txh_space_tooverwrite, + refcount_count(&txh->txh_space_tooverwrite)); + refcount_destroy_many(&txh->txh_space_tounref, + refcount_count(&txh->txh_space_tounref)); + refcount_destroy_many(&txh->txh_memory_tohold, + refcount_count(&txh->txh_memory_tohold)); + refcount_destroy_many(&txh->txh_fudge, + refcount_count(&txh->txh_fudge)); kmem_free(txh, sizeof (dmu_tx_hold_t)); + if (dn != NULL) + dnode_rele(dn, tx); + } + + list_destroy(&tx->tx_callbacks); + list_destroy(&tx->tx_holds); +#ifdef ZFS_DEBUG + refcount_destroy_many(&tx->tx_space_written, + refcount_count(&tx->tx_space_written)); + refcount_destroy_many(&tx->tx_space_freed, + refcount_count(&tx->tx_space_freed)); +#endif + kmem_free(tx, sizeof (dmu_tx_t)); +} + +void +dmu_tx_commit(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + + /* + * Go through the transaction's hold list and remove holds on + * associated dnodes, notifying waiters if no holds remain. 
+ */ + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + if (dn == NULL) continue; + mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); @@ -1111,7 +1493,6 @@ dmu_tx_commit(dmu_tx_t *tx) cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); - dnode_rele(dn, tx); } if (tx->tx_tempreserve_cookie) @@ -1123,51 +1504,26 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); - list_destroy(&tx->tx_callbacks); - list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", tx->tx_space_towrite, refcount_count(&tx->tx_space_written), tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); - refcount_destroy_many(&tx->tx_space_written, - refcount_count(&tx->tx_space_written)); - refcount_destroy_many(&tx->tx_space_freed, - refcount_count(&tx->tx_space_freed)); #endif - kmem_free(tx, sizeof (dmu_tx_t)); + dmu_tx_destroy(tx); } void dmu_tx_abort(dmu_tx_t *tx) { - dmu_tx_hold_t *txh; - ASSERT(tx->tx_txg == 0); - while (txh = list_head(&tx->tx_holds)) { - dnode_t *dn = txh->txh_dnode; - - list_remove(&tx->tx_holds, txh); - kmem_free(txh, sizeof (dmu_tx_hold_t)); - if (dn != NULL) - dnode_rele(dn, tx); - } - /* * Call any registered callbacks with an error code. */ if (!list_is_empty(&tx->tx_callbacks)) dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); - list_destroy(&tx->tx_callbacks); - list_destroy(&tx->tx_holds); -#ifdef ZFS_DEBUG - refcount_destroy_many(&tx->tx_space_written, - refcount_count(&tx->tx_space_written)); - refcount_destroy_many(&tx->tx_space_freed, - refcount_count(&tx->tx_space_freed)); -#endif - kmem_free(tx, sizeof (dmu_tx_t)); + dmu_tx_destroy(tx); } uint64_t @@ -1177,6 +1533,14 @@ dmu_tx_get_txg(dmu_tx_t *tx) return (tx->tx_txg); } +dsl_pool_t * +dmu_tx_pool(dmu_tx_t *tx) +{ + ASSERT(tx->tx_pool != NULL); + return (tx->tx_pool); +} + + void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { @@ -1198,9 +1562,163 @@ dmu_tx_do_callbacks(list_t *cb_list, int { dmu_tx_callback_t *dcb; - while (dcb = list_head(cb_list)) { + while ((dcb = list_head(cb_list)) != NULL) { list_remove(cb_list, dcb); dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. 
+ */ +static void +dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) +{ + int i; + + if (!sa->sa_need_attr_registration) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (!sa->sa_attr_table[i].sa_registered) { + if (sa->sa_reg_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, + B_TRUE, sa->sa_attr_table[i].sa_name); + else + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, + B_TRUE, sa->sa_attr_table[i].sa_name); + } + } +} + + +void +dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) +{ + dnode_t *dn; + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, + THT_SPILL, 0, 0); + + dn = txh->txh_dnode; + + if (dn == NULL) + return; + + /* If blkptr doesn't exist then add space to towrite */ + if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); + } else { + blkptr_t *bp; + + bp = &dn->dn_phys->dn_spill; + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, + bp, bp->blk_birth)) { + (void) refcount_add_many(&txh->txh_space_tooverwrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); + } else { + (void) refcount_add_many(&txh->txh_space_towrite, + SPA_OLD_MAXBLOCKSIZE, FTAG); + } + if (!BP_IS_HOLE(bp)) { + (void) refcount_add_many(&txh->txh_space_tounref, + SPA_OLD_MAXBLOCKSIZE, FTAG); + } + } +} + +void +dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) +{ + sa_os_t *sa = tx->tx_objset->os_sa; + + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + else { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) + return; + + (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, + THT_SPILL, 0, 0); +} + +/* + * Hold SA attribute + * + * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) + * + * variable_size is the total size of all variable sized attributes + * passed to this function. It is not the total size of all + * variable size attributes that *may* exist on this object. 
+ */ +void +dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) +{ + uint64_t object; + sa_os_t *sa = tx->tx_objset->os_sa; + + ASSERT(hdl != NULL); + + object = sa_handle_object(hdl); + + dmu_tx_hold_bonus(tx, object); + + if (tx->tx_objset->os_sa->sa_master_obj == 0) + return; + + if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || + tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); + dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } + + dmu_tx_sa_registration_hold(sa, tx); + + if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) + dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); + + if (sa->sa_force_spill || may_grow || hdl->sa_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } else { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_have_spill) { + ASSERT(tx->tx_txg == 0); + dmu_tx_hold_spill(tx, object); + } + DB_DNODE_EXIT(db); + } +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dmu_zfetch.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c 27 Feb 2010 22:30:50 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c 19 Dec 2016 02:06:18 -0000 @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + #include #include #include @@ -32,207 +36,60 @@ #include /* - * I'm against tune-ables, but these should probably exist as tweakable globals - * until we can get this working the way we want it to. + * This tunable disables predictive prefetch. Note that it leaves "prescient" + * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, + * prescient prefetch never issues i/os that end up not being needed, + * so it can't hurt performance. 
*/ - -int zfs_prefetch_disable = 0; +boolean_t zfs_prefetch_disable = B_FALSE; /* max # of streams per zfetch */ uint32_t zfetch_max_streams = 8; /* min time before stream reclaim */ uint32_t zfetch_min_sec_reap = 2; -/* max number of blocks to fetch at a time */ -uint32_t zfetch_block_cap = 256; -/* number of bytes in a array_read at which we stop prefetching (1Mb) */ +/* max bytes to prefetch per stream (default 8MB) */ +uint32_t zfetch_max_distance = 8 * 1024 * 1024; +/* max bytes to prefetch indirects for per stream (default 64MB) */ +uint32_t zfetch_max_idistance = 64 * 1024 * 1024; +/* max number of bytes in an array_read in which we allow prefetching (1MB) */ uint64_t zfetch_array_rd_sz = 1024 * 1024; -/* forward decls for static routines */ -static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); -static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); -static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); -static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); -static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); -static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); -static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); -static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); -static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, + &zfs_prefetch_disable, 0, "Disable prefetch"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RWTUN, + &zfetch_max_streams, 0, "Max # of streams per zfetch"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN, + &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, + &zfetch_max_distance, 0, "Max bytes to prefetch per stream"); +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, + &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream"); +SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN, + &zfetch_array_rd_sz, 0, + "Number of bytes in a array_read at which we stop prefetching"); typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; - kstat_named_t zfetchstat_colinear_hits; - kstat_named_t zfetchstat_colinear_misses; - kstat_named_t zfetchstat_stride_hits; - kstat_named_t zfetchstat_stride_misses; - kstat_named_t zfetchstat_reclaim_successes; - kstat_named_t zfetchstat_reclaim_failures; - kstat_named_t zfetchstat_stream_resets; - kstat_named_t zfetchstat_stream_noresets; - kstat_named_t zfetchstat_bogus_streams; + kstat_named_t zfetchstat_max_streams; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, - { "colinear_hits", KSTAT_DATA_UINT64 }, - { "colinear_misses", KSTAT_DATA_UINT64 }, - { "stride_hits", KSTAT_DATA_UINT64 }, - { "stride_misses", KSTAT_DATA_UINT64 }, - { "reclaim_successes", KSTAT_DATA_UINT64 }, - { "reclaim_failures", KSTAT_DATA_UINT64 }, - { "streams_resets", KSTAT_DATA_UINT64 }, - { "streams_noresets", KSTAT_DATA_UINT64 }, - { "bogus_streams", KSTAT_DATA_UINT64 }, + { "max_streams", KSTAT_DATA_UINT64 }, }; -#define ZFETCHSTAT_INCR(stat, val) \ - atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); - -#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1); +#define ZFETCHSTAT_BUMP(stat) \ + atomic_inc_64(&zfetch_stats.stat.value.ui64); kstat_t *zfetch_ksp; 
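
The reworked statistics above reduce the zfetch kstat set to three counters (hits, misses, max_streams), each bumped with a single atomic increment by ZFETCHSTAT_BUMP(). A small standalone sketch of that counter pattern using C11 atomics in place of the kernel's atomic_inc_64(); the demo names are illustrative and do not model the kstat machinery itself:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative stand-in for the zfetch counters. */
    struct zfetch_stats_demo {
        _Atomic uint64_t hits;
        _Atomic uint64_t misses;
        _Atomic uint64_t max_streams;
    };

    static struct zfetch_stats_demo stats;

    /* Analogous to ZFETCHSTAT_BUMP(stat): one lock-free increment. */
    #define DEMO_BUMP(field) \
        atomic_fetch_add_explicit(&stats.field, 1, memory_order_relaxed)

    int
    main(void)
    {
        DEMO_BUMP(hits);
        DEMO_BUMP(hits);
        DEMO_BUMP(misses);

        printf("hits=%llu misses=%llu max_streams=%llu\n",
            (unsigned long long)atomic_load(&stats.hits),
            (unsigned long long)atomic_load(&stats.misses),
            (unsigned long long)atomic_load(&stats.max_streams));
        return (0);
    }
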
-/* - * Given a zfetch structure and a zstream structure, determine whether the - * blocks to be read are part of a co-linear pair of existing prefetch - * streams. If a set is found, coalesce the streams, removing one, and - * configure the prefetch so it looks for a strided access pattern. - * - * In other words: if we find two sequential access streams that are - * the same length and distance N appart, and this read is N from the - * last stream, then we are probably in a strided access pattern. So - * combine the two sequential streams into a single strided stream. - * - * If no co-linear streams are found, return NULL. - */ -static int -dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) -{ - zstream_t *z_walk; - zstream_t *z_comp; - - if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) - return (0); - - if (zh == NULL) { - rw_exit(&zf->zf_rwlock); - return (0); - } - - for (z_walk = list_head(&zf->zf_stream); z_walk; - z_walk = list_next(&zf->zf_stream, z_walk)) { - for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; - z_comp = list_next(&zf->zf_stream, z_comp)) { - int64_t diff; - - if (z_walk->zst_len != z_walk->zst_stride || - z_comp->zst_len != z_comp->zst_stride) { - continue; - } - - diff = z_comp->zst_offset - z_walk->zst_offset; - if (z_comp->zst_offset + diff == zh->zst_offset) { - z_walk->zst_offset = zh->zst_offset; - z_walk->zst_direction = diff < 0 ? -1 : 1; - z_walk->zst_stride = - diff * z_walk->zst_direction; - z_walk->zst_ph_offset = - zh->zst_offset + z_walk->zst_stride; - dmu_zfetch_stream_remove(zf, z_comp); - mutex_destroy(&z_comp->zst_lock); - kmem_free(z_comp, sizeof (zstream_t)); - - dmu_zfetch_dofetch(zf, z_walk); - - rw_exit(&zf->zf_rwlock); - return (1); - } - - diff = z_walk->zst_offset - z_comp->zst_offset; - if (z_walk->zst_offset + diff == zh->zst_offset) { - z_walk->zst_offset = zh->zst_offset; - z_walk->zst_direction = diff < 0 ? -1 : 1; - z_walk->zst_stride = - diff * z_walk->zst_direction; - z_walk->zst_ph_offset = - zh->zst_offset + z_walk->zst_stride; - dmu_zfetch_stream_remove(zf, z_comp); - mutex_destroy(&z_comp->zst_lock); - kmem_free(z_comp, sizeof (zstream_t)); - - dmu_zfetch_dofetch(zf, z_walk); - - rw_exit(&zf->zf_rwlock); - return (1); - } - } - } - - rw_exit(&zf->zf_rwlock); - return (0); -} - -/* - * Given a zstream_t, determine the bounds of the prefetch. Then call the - * routine that actually prefetches the individual blocks. - */ -static void -dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) -{ - uint64_t prefetch_tail; - uint64_t prefetch_limit; - uint64_t prefetch_ofst; - uint64_t prefetch_len; - uint64_t blocks_fetched; - - zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len); - zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap); - - prefetch_tail = MAX((int64_t)zs->zst_ph_offset, - (int64_t)(zs->zst_offset + zs->zst_stride)); - /* - * XXX: use a faster division method? - */ - prefetch_limit = zs->zst_offset + zs->zst_len + - (zs->zst_cap * zs->zst_stride) / zs->zst_len; - - while (prefetch_tail < prefetch_limit) { - prefetch_ofst = zs->zst_offset + zs->zst_direction * - (prefetch_tail - zs->zst_offset); - - prefetch_len = zs->zst_len; - - /* - * Don't prefetch beyond the end of the file, if working - * backwards. 
- */ - if ((zs->zst_direction == ZFETCH_BACKWARD) && - (prefetch_ofst > prefetch_tail)) { - prefetch_len += prefetch_ofst; - prefetch_ofst = 0; - } - - /* don't prefetch more than we're supposed to */ - if (prefetch_len > zs->zst_len) - break; - - blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode, - prefetch_ofst, zs->zst_len); - - prefetch_tail += zs->zst_stride; - /* stop if we've run out of stuff to prefetch */ - if (blocks_fetched < zs->zst_len) - break; - } - zs->zst_ph_offset = prefetch_tail; - zs->zst_last = ddi_get_lbolt(); -} - void zfetch_init(void) { - zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -260,272 +117,41 @@ zfetch_fini(void) void dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { - if (zf == NULL) { + if (zf == NULL) return; - } zf->zf_dnode = dno; - zf->zf_stream_cnt = 0; - zf->zf_alloc_fail = 0; list_create(&zf->zf_stream, sizeof (zstream_t), - offsetof(zstream_t, zst_node)); + offsetof(zstream_t, zs_node)); rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); } -/* - * This function computes the actual size, in blocks, that can be prefetched, - * and fetches it. - */ -static uint64_t -dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) -{ - uint64_t fetchsz; - uint64_t i; - - fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); - - for (i = 0; i < fetchsz; i++) { - dbuf_prefetch(dn, blkid + i); - } - - return (fetchsz); -} - -/* - * this function returns the number of blocks that would be prefetched, based - * upon the supplied dnode, blockid, and nblks. This is used so that we can - * update streams in place, and then prefetch with their old value after the - * fact. This way, we can delay the prefetch, but subsequent accesses to the - * stream won't result in the same data being prefetched multiple times. - */ -static uint64_t -dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) -{ - uint64_t fetchsz; - - if (blkid > dn->dn_maxblkid) { - return (0); - } - - /* compute fetch size */ - if (blkid + nblks + 1 > dn->dn_maxblkid) { - fetchsz = (dn->dn_maxblkid - blkid) + 1; - ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid); - } else { - fetchsz = nblks; - } - - - return (fetchsz); -} - -/* - * given a zfetch and a zstream structure, see if there is an associated zstream - * for this block read. If so, it starts a prefetch for the stream it - * located and returns true, otherwise it returns false - */ -static int -dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { - zstream_t *zs; - int64_t diff; - int reset = !prefetched; - int rc = 0; - - if (zh == NULL) - return (0); - - /* - * XXX: This locking strategy is a bit coarse; however, it's impact has - * yet to be tested. If this turns out to be an issue, it can be - * modified in a number of different ways. - */ - - rw_enter(&zf->zf_rwlock, RW_READER); -top: - - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - - /* - * XXX - should this be an assert? - */ - if (zs->zst_len == 0) { - /* bogus stream */ - ZFETCHSTAT_BUMP(zfetchstat_bogus_streams); - continue; - } - - /* - * We hit this case when we are in a strided prefetch stream: - * we will read "len" blocks before "striding". 
- */ - if (zh->zst_offset >= zs->zst_offset && - zh->zst_offset < zs->zst_offset + zs->zst_len) { - if (prefetched) { - /* already fetched */ - ZFETCHSTAT_BUMP(zfetchstat_stride_hits); - rc = 1; - goto out; - } else { - ZFETCHSTAT_BUMP(zfetchstat_stride_misses); - } - } - - /* - * This is the forward sequential read case: we increment - * len by one each time we hit here, so we will enter this - * case on every read. - */ - if (zh->zst_offset == zs->zst_offset + zs->zst_len) { - - reset = !prefetched && zs->zst_len > 1; - - mutex_enter(&zs->zst_lock); - - if (zh->zst_offset != zs->zst_offset + zs->zst_len) { - mutex_exit(&zs->zst_lock); - goto top; - } - zs->zst_len += zh->zst_len; - diff = zs->zst_len - zfetch_block_cap; - if (diff > 0) { - zs->zst_offset += diff; - zs->zst_len = zs->zst_len > diff ? - zs->zst_len - diff : 0; - } - zs->zst_direction = ZFETCH_FORWARD; - - break; - - /* - * Same as above, but reading backwards through the file. - */ - } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) { - /* backwards sequential access */ - - reset = !prefetched && zs->zst_len > 1; - - mutex_enter(&zs->zst_lock); - - if (zh->zst_offset != zs->zst_offset - zh->zst_len) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset = zs->zst_offset > zh->zst_len ? - zs->zst_offset - zh->zst_len : 0; - zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ? - zs->zst_ph_offset - zh->zst_len : 0; - zs->zst_len += zh->zst_len; - - diff = zs->zst_len - zfetch_block_cap; - if (diff > 0) { - zs->zst_ph_offset = zs->zst_ph_offset > diff ? - zs->zst_ph_offset - diff : 0; - zs->zst_len = zs->zst_len > diff ? - zs->zst_len - diff : zs->zst_len; - } - zs->zst_direction = ZFETCH_BACKWARD; - - break; - - } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride < - zs->zst_len) && (zs->zst_len != zs->zst_stride)) { - /* strided forward access */ - - mutex_enter(&zs->zst_lock); - - if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >= - zs->zst_len) || (zs->zst_len == zs->zst_stride)) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset += zs->zst_stride; - zs->zst_direction = ZFETCH_FORWARD; - - break; - - } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride < - zs->zst_len) && (zs->zst_len != zs->zst_stride)) { - /* strided reverse access */ - - mutex_enter(&zs->zst_lock); - - if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >= - zs->zst_len) || (zs->zst_len == zs->zst_stride)) { - mutex_exit(&zs->zst_lock); - goto top; - } - - zs->zst_offset = zs->zst_offset > zs->zst_stride ? - zs->zst_offset - zs->zst_stride : 0; - zs->zst_ph_offset = (zs->zst_ph_offset > - (2 * zs->zst_stride)) ? - (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0; - zs->zst_direction = ZFETCH_BACKWARD; - - break; - } - } - - if (zs) { - if (reset) { - zstream_t *remove = zs; - - ZFETCHSTAT_BUMP(zfetchstat_stream_resets); - rc = 0; - mutex_exit(&zs->zst_lock); - rw_exit(&zf->zf_rwlock); - rw_enter(&zf->zf_rwlock, RW_WRITER); - /* - * Relocate the stream, in case someone removes - * it while we were acquiring the WRITER lock. 
- */ - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - if (zs == remove) { - dmu_zfetch_stream_remove(zf, zs); - mutex_destroy(&zs->zst_lock); - kmem_free(zs, sizeof (zstream_t)); - break; - } - } - } else { - ZFETCHSTAT_BUMP(zfetchstat_stream_noresets); - rc = 1; - dmu_zfetch_dofetch(zf, zs); - mutex_exit(&zs->zst_lock); - } - } -out: - rw_exit(&zf->zf_rwlock); - return (rc); + ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zs_lock); + kmem_free(zs, sizeof (*zs)); } /* - * Clean-up state associated with a zfetch structure. This frees allocated - * structure members, empties the zf_stream tree, and generally makes things - * nice. This doesn't free the zfetch_t itself, that's left to the caller. + * Clean-up state associated with a zfetch structure (e.g. destroy the + * streams). This doesn't free the zfetch_t itself, that's left to the caller. */ void -dmu_zfetch_rele(zfetch_t *zf) +dmu_zfetch_fini(zfetch_t *zf) { - zstream_t *zs; - zstream_t *zs_next; + zstream_t *zs; ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); - for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { - zs_next = list_next(&zf->zf_stream, zs); - - list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zst_lock); - kmem_free(zs, sizeof (zstream_t)); - } + rw_enter(&zf->zf_rwlock, RW_WRITER); + while ((zs = list_head(&zf->zf_stream)) != NULL) + dmu_zfetch_stream_remove(zf, zs); + rw_exit(&zf->zf_rwlock); list_destroy(&zf->zf_stream); rw_destroy(&zf->zf_rwlock); @@ -533,192 +159,190 @@ dmu_zfetch_rele(zfetch_t *zf) } /* - * Given a zfetch and zstream structure, insert the zstream structure into the - * AVL tree contained within the zfetch structure. Peform the appropriate - * book-keeping. It is possible that another thread has inserted a stream which - * matches one that we are about to insert, so we must be sure to check for this - * case. If one is found, return failure, and let the caller cleanup the - * duplicates. + * If there aren't too many streams already, create a new stream. + * The "blkid" argument is the next block that we expect this stream to access. + * While we're here, clean up old streams (which haven't been + * accessed for at least zfetch_min_sec_reap seconds). */ -static int -dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) +static void +dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { - zstream_t *zs_walk; - zstream_t *zs_next; + zstream_t *zs_next; + int numstreams = 0; ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { - zs_next = list_next(&zf->zf_stream, zs_walk); - - if (dmu_zfetch_streams_equal(zs_walk, zs)) { - return (0); - } - } - - list_insert_head(&zf->zf_stream, zs); - zf->zf_stream_cnt++; - return (1); -} - - -/* - * Walk the list of zstreams in the given zfetch, find an old one (by time), and - * reclaim it for use by the caller. - */ -static zstream_t * -dmu_zfetch_stream_reclaim(zfetch_t *zf) -{ - zstream_t *zs; - - if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) - return (0); - - for (zs = list_head(&zf->zf_stream); zs; - zs = list_next(&zf->zf_stream, zs)) { - - if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap) - break; + /* + * Clean up old streams. 
+ */ + for (zstream_t *zs = list_head(&zf->zf_stream); + zs != NULL; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + if (((gethrtime() - zs->zs_atime) / NANOSEC) > + zfetch_min_sec_reap) + dmu_zfetch_stream_remove(zf, zs); + else + numstreams++; } - if (zs) { - dmu_zfetch_stream_remove(zf, zs); - mutex_destroy(&zs->zst_lock); - bzero(zs, sizeof (zstream_t)); - } else { - zf->zf_alloc_fail++; + /* + * The maximum number of streams is normally zfetch_max_streams, + * but for small files we lower it such that it's at least possible + * for all the streams to be non-overlapping. + * + * If we are already at the maximum number of streams for this file, + * even after removing old streams, then don't create this stream. + */ + uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, + zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / + zfetch_max_distance)); + if (numstreams >= max_streams) { + ZFETCHSTAT_BUMP(zfetchstat_max_streams); + return; } - rw_exit(&zf->zf_rwlock); - return (zs); -} + zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); + zs->zs_blkid = blkid; + zs->zs_pf_blkid = blkid; + zs->zs_ipf_blkid = blkid; + zs->zs_atime = gethrtime(); + mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); -/* - * Given a zfetch and zstream structure, remove the zstream structure from its - * container in the zfetch structure. Perform the appropriate book-keeping. - */ -static void -dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) -{ - ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); - - list_remove(&zf->zf_stream, zs); - zf->zf_stream_cnt--; -} - -static int -dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) -{ - if (zs1->zst_offset != zs2->zst_offset) - return (0); - - if (zs1->zst_len != zs2->zst_len) - return (0); - - if (zs1->zst_stride != zs2->zst_stride) - return (0); - - if (zs1->zst_ph_offset != zs2->zst_ph_offset) - return (0); - - if (zs1->zst_cap != zs2->zst_cap) - return (0); - - if (zs1->zst_direction != zs2->zst_direction) - return (0); - - return (1); + list_insert_head(&zf->zf_stream, zs); } /* - * This is the prefetch entry point. It calls all of the other dmu_zfetch - * routines to create, delete, find, or operate upon prefetch streams. + * This is the predictive prefetch entry point. It associates dnode access + * specified with blkid and nblks arguments with prefetch stream, predicts + * further accesses based on that stats and initiates speculative prefetch. + * fetch_data argument specifies whether actual data blocks should be fetched: + * FALSE -- prefetch only indirect blocks for predicted data blocks; + * TRUE -- prefetch predicted data blocks plus following indirect blocks. */ void -dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) +dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data) { - zstream_t zst; - zstream_t *newstream; - int fetched; - int inserted; - unsigned int blkshft; - uint64_t blksz; + zstream_t *zs; + int64_t pf_start, ipf_start, ipf_istart, ipf_iend; + int64_t pf_ahead_blks, max_blks; + int epbs, max_dist_blks, pf_nblks, ipf_nblks; + uint64_t end_of_access_blkid = blkid + nblks; if (zfs_prefetch_disable) return; - /* files that aren't ln2 blocksz are only one block -- nothing to do */ - if (!zf->zf_dnode->dn_datablkshift) + /* + * As a fast path for small (single-block) files, ignore access + * to the first block. 
+ */ + if (blkid == 0) return; - /* convert offset and size, into blockid and nblocks */ - blkshft = zf->zf_dnode->dn_datablkshift; - blksz = (1 << blkshft); - - bzero(&zst, sizeof (zstream_t)); - zst.zst_offset = offset >> blkshft; - zst.zst_len = (P2ROUNDUP(offset + size, blksz) - - P2ALIGN(offset, blksz)) >> blkshft; - - fetched = dmu_zfetch_find(zf, &zst, prefetched); - if (fetched) { - ZFETCHSTAT_BUMP(zfetchstat_hits); - } else { - ZFETCHSTAT_BUMP(zfetchstat_misses); - if (fetched = dmu_zfetch_colinear(zf, &zst)) { - ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); - } else { - ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); + rw_enter(&zf->zf_rwlock, RW_READER); + + for (zs = list_head(&zf->zf_stream); zs != NULL; + zs = list_next(&zf->zf_stream, zs)) { + if (blkid == zs->zs_blkid) { + mutex_enter(&zs->zs_lock); + /* + * zs_blkid could have changed before we + * acquired zs_lock; re-check them here. + */ + if (blkid != zs->zs_blkid) { + mutex_exit(&zs->zs_lock); + continue; + } + break; } } - if (!fetched) { - newstream = dmu_zfetch_stream_reclaim(zf); - + if (zs == NULL) { /* - * we still couldn't find a stream, drop the lock, and allocate - * one if possible. Otherwise, give up and go home. + * This access is not part of any existing stream. Create + * a new stream for it. */ - if (newstream) { - ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); - } else { - uint64_t maxblocks; - uint32_t max_streams; - uint32_t cur_streams; - - ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); - cur_streams = zf->zf_stream_cnt; - maxblocks = zf->zf_dnode->dn_maxblkid; - - max_streams = MIN(zfetch_max_streams, - (maxblocks / zfetch_block_cap)); - if (max_streams == 0) { - max_streams++; - } + ZFETCHSTAT_BUMP(zfetchstat_misses); + if (rw_tryupgrade(&zf->zf_rwlock)) + dmu_zfetch_stream_create(zf, end_of_access_blkid); + rw_exit(&zf->zf_rwlock); + return; + } - if (cur_streams >= max_streams) { - return; - } - newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); - } + /* + * This access was to a block that we issued a prefetch for on + * behalf of this stream. Issue further prefetches for this stream. + * + * Normally, we start prefetching where we stopped + * prefetching last (zs_pf_blkid). But when we get our first + * hit on this stream, zs_pf_blkid == zs_blkid, we don't + * want to prefetch the block we just accessed. In this case, + * start just after the block we just accessed. + */ + pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid); + + /* + * Double our amount of prefetched data, but don't let the + * prefetch get further ahead than zfetch_max_distance. + */ + if (fetch_data) { + max_dist_blks = + zfetch_max_distance >> zf->zf_dnode->dn_datablkshift; + /* + * Previously, we were (zs_pf_blkid - blkid) ahead. We + * want to now be double that, so read that amount again, + * plus the amount we are catching up by (i.e. the amount + * read just now). 
+ */ + pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks; + max_blks = max_dist_blks - (pf_start - end_of_access_blkid); + pf_nblks = MIN(pf_ahead_blks, max_blks); + } else { + pf_nblks = 0; + } - newstream->zst_offset = zst.zst_offset; - newstream->zst_len = zst.zst_len; - newstream->zst_stride = zst.zst_len; - newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; - newstream->zst_cap = zst.zst_len; - newstream->zst_direction = ZFETCH_FORWARD; - newstream->zst_last = ddi_get_lbolt(); + zs->zs_pf_blkid = pf_start + pf_nblks; - mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); + /* + * Do the same for indirects, starting from where we stopped last, + * or where we will stop reading data blocks (and the indirects + * that point to them). + */ + ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid); + max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift; + /* + * We want to double our distance ahead of the data prefetch + * (or reader, if we are not prefetching data). Previously, we + * were (zs_ipf_blkid - blkid) ahead. To double that, we read + * that amount again, plus the amount we are catching up by + * (i.e. the amount read now + the amount of data prefetched now). + */ + pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks; + max_blks = max_dist_blks - (ipf_start - end_of_access_blkid); + ipf_nblks = MIN(pf_ahead_blks, max_blks); + zs->zs_ipf_blkid = ipf_start + ipf_nblks; + + epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; + ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; + ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; + + zs->zs_atime = gethrtime(); + zs->zs_blkid = end_of_access_blkid; + mutex_exit(&zs->zs_lock); + rw_exit(&zf->zf_rwlock); - rw_enter(&zf->zf_rwlock, RW_WRITER); - inserted = dmu_zfetch_stream_insert(zf, newstream); - rw_exit(&zf->zf_rwlock); + /* + * dbuf_prefetch() is asynchronous (even when it needs to read + * indirect blocks), but we still prefer to drop our locks before + * calling it to reduce the time we hold them. + */ - if (!inserted) { - mutex_destroy(&newstream->zst_lock); - kmem_free(newstream, sizeof (zstream_t)); - } + for (int i = 0; i < pf_nblks; i++) { + dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + } + for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { + dbuf_prefetch(zf->zf_dnode, 1, iblk, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); } + ZFETCHSTAT_BUMP(zfetchstat_hits); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c,v retrieving revision 1.4 diff -u -p -r1.4 dnode.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c 21 Jun 2013 16:22:46 -0000 1.4 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c 26 Apr 2017 00:42:13 -0000 @@ -19,8 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] */ #include @@ -35,44 +37,130 @@ #include #include #include - -static int free_range_compar(const void *node1, const void *node2); +#include static kmem_cache_t *dnode_cache; +/* + * Define DNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define DNODE_STATS +#endif /* DEBUG */ + +#ifdef DNODE_STATS +#define DNODE_STAT_ADD(stat) ((stat)++) +#else +#define DNODE_STAT_ADD(stat) /* nothing */ +#endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; +#ifdef illumos +static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); +#endif + +static int +dbuf_compare(const void *x1, const void *x2) +{ + const dmu_buf_impl_t *d1 = x1; + const dmu_buf_impl_t *d2 = x2; + + if (d1->db_level < d2->db_level) { + return (-1); + } + if (d1->db_level > d2->db_level) { + return (1); + } + + if (d1->db_blkid < d2->db_blkid) { + return (-1); + } + if (d1->db_blkid > d2->db_blkid) { + return (1); + } + + if (d1->db_state == DB_SEARCH) { + ASSERT3S(d2->db_state, !=, DB_SEARCH); + return (-1); + } else if (d2->db_state == DB_SEARCH) { + ASSERT3S(d1->db_state, !=, DB_SEARCH); + return (1); + } + + if ((uintptr_t)d1 < (uintptr_t)d2) { + return (-1); + } + if ((uintptr_t)d1 > (uintptr_t)d2) { + return (1); + } + return (0); +} + /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { + dnode_t *dn = arg; int i; - dnode_t *dn = unused; - bzero(dn, sizeof (dnode_t)); +#ifdef __NetBSD__ + dn = unused; +#endif rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); - refcount_create(&dn->dn_holds); + /* + * Every dbuf has a reference, and dropping a tracked reference is + * O(number of references), so don't track dn_holds. 
+ */ + refcount_create_untracked(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); + list_link_init(&dn->dn_link); + + bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); + bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); + bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); + bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); + bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); + bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); + bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { - avl_create(&dn->dn_ranges[i], free_range_compar, - sizeof (free_range_t), - offsetof(struct free_range, fr_node)); + list_link_init(&dn->dn_dirty_link[i]); + dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } - list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_have_spill = B_FALSE; + dn->dn_zio = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dn->dn_dbufs_count = 0; + avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); + dn->dn_moved = 0; + POINTER_INVALIDATE(&dn->dn_objset); return (0); } @@ -81,35 +169,67 @@ static void dnode_dest(void *arg, void *unused) { int i; - dnode_t *dn = unused; + dnode_t *dn = arg; +#ifdef __NetBSD__ + dn = unused; +#endif rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); + ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { - avl_destroy(&dn->dn_ranges[i]); + ASSERT(!list_link_active(&dn->dn_dirty_link[i])); + ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); - } + ASSERT0(dn->dn_next_nblkptr[i]); + ASSERT0(dn->dn_next_nlevels[i]); + ASSERT0(dn->dn_next_indblkshift[i]); + ASSERT0(dn->dn_next_bonustype[i]); + ASSERT0(dn->dn_rm_spillblk[i]); + ASSERT0(dn->dn_next_bonuslen[i]); + ASSERT0(dn->dn_next_blksz[i]); + } + + ASSERT0(dn->dn_allocated_txg); + ASSERT0(dn->dn_free_txg); + ASSERT0(dn->dn_assigned_txg); + ASSERT0(dn->dn_dirtyctx); + ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(!dn->dn_have_spill); + ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT0(dn->dn_oldused); + ASSERT0(dn->dn_oldflags); + ASSERT0(dn->dn_olduid); + ASSERT0(dn->dn_oldgid); + ASSERT0(dn->dn_newuid); + ASSERT0(dn->dn_newgid); + ASSERT0(dn->dn_id_flags); - list_destroy(&dn->dn_dbufs); + ASSERT0(dn->dn_dbufs_count); + avl_destroy(&dn->dn_dbufs); } void dnode_init(void) { + ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); + kmem_cache_set_move(dnode_cache, dnode_move); } void dnode_fini(void) { kmem_cache_destroy(dnode_cache); + dnode_cache = NULL; } @@ -121,8 +241,9 @@ dnode_verify(dnode_t *dn) ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); + ASSERT(dn->dn_handle->dnh_dnode == dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; @@ -141,7 +262,7 @@ 
dnode_verify(dnode_t *dn) ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); - ASSERT3U(dn->dn_type, <=, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); @@ -207,9 +328,16 @@ dnode_byteswap(dnode_phys_t *dnp) */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; - ASSERT3U(dnp->dn_bonustype, <, DMU_OT_NUMTYPES); - dmu_ot[dnp->dn_bonustype].ot_byteswap(dnp->dn_bonus + off, len); + ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(dnp->dn_bonustype); + dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } + + /* Swap SPILL block if we have one */ + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); + } void @@ -228,19 +356,6 @@ dnode_buf_byteswap(void *vbuf, size_t si } } -static int -free_range_compar(const void *node1, const void *node2) -{ - const free_range_t *rp1 = node1; - const free_range_t *rp2 = node2; - - if (rp1->fr_blkid < rp2->fr_blkid) - return (-1); - else if (rp1->fr_blkid > rp2->fr_blkid) - return (1); - else return (0); -} - void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) { @@ -258,33 +373,66 @@ dnode_setbonuslen(dnode_t *dn, int newsi rw_exit(&dn->dn_struct_rwlock); } +void +dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dn->dn_bonustype = newtype; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + rw_exit(&dn->dn_struct_rwlock); +} + +void +dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + dnode_setdirty(dn, tx); + dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; + dn->dn_have_spill = B_FALSE; +} + static void dnode_setdblksz(dnode_t *dn, int size) { - ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0); + ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, >=, SPA_MINBLOCKSIZE); ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); dn->dn_datablksz = size; dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; - dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0; + dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; } static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, - uint64_t object) + uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); -// (void) dnode_cons(dn, NULL, 0); /* XXX */ + dnode_t *dn; - dn->dn_objset = os; + dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(dn->dn_objset)); + dn->dn_moved = 0; + + /* + * Defer setting dn_objset until the dnode is ready to be a candidate + * for the dnode_move() callback. 
+ */ dn->dn_object = object; dn->dn_dbuf = db; + dn->dn_handle = dnh; dn->dn_phys = dnp; - if (dnp->dn_datablkszsec) + if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; @@ -294,51 +442,100 @@ dnode_create(objset_t *os, dnode_phys_t dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_maxblkid = dnp->dn_maxblkid; + dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); + dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); - ASSERT(dn->dn_phys->dn_type < DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); + mutex_enter(&os->os_lock); - list_insert_head(&os->os_dnodes, dn); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) + list_insert_head(&os->os_dnodes, dn); + membar_producer(); + + /* + * Everything else must be valid before assigning dn_objset + * makes the dnode eligible for dnode_move(). + */ + dn->dn_objset = os; + + dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } +/* + * Caller must be holding the dnode handle, which is released upon return. 
+ */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; -#ifdef ZFS_DEBUG - int i; - - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - ASSERT(NULL == list_head(&dn->dn_dirty_records[i])); - ASSERT(0 == avl_numnodes(&dn->dn_ranges[i])); - } - ASSERT(NULL == list_head(&dn->dn_dbufs)); -#endif - ASSERT(dn->dn_oldphys == NULL); + ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); - list_remove(&os->os_dnodes, dn); + POINTER_INVALIDATE(&dn->dn_objset); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } mutex_exit(&os->os_lock); - if (dn->dn_dirtyctx_firstset) { + /* the dnode can no longer move, so we can release the handle */ + zrl_remove(&dn->dn_handle->dnh_zrlock); + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + + dn->dn_dirtyctx = 0; + if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } - dmu_zfetch_rele(&dn->dn_zfetch); - if (dn->dn_bonus) { + if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); + dbuf_destroy(dn->dn_bonus); dn->dn_bonus = NULL; } + dn->dn_zio = NULL; + + dn->dn_have_spill = B_FALSE; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_id_flags = 0; + + dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); + + if (complete_os_eviction) + dmu_objset_evict_done(os); } void @@ -347,10 +544,10 @@ dnode_allocate(dnode_t *dn, dmu_object_t { int i; + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; - else if (blocksize > SPA_MAXBLOCKSIZE) - blocksize = SPA_MAXBLOCKSIZE; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); @@ -366,34 +563,42 @@ dnode_allocate(dnode_t *dn, dmu_object_t ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); - ASSERT3U(ot, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); - ASSERT3U(dn->dn_maxblkid, ==, 0); - ASSERT3U(dn->dn_allocated_txg, ==, 0); - ASSERT3U(dn->dn_assigned_txg, ==, 0); + ASSERT0(dn->dn_maxblkid); + ASSERT0(dn->dn_allocated_txg); + ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { - ASSERT3U(dn->dn_next_nlevels[i], ==, 0); - ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); - ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); - ASSERT3U(dn->dn_next_blksz[i], ==, 0); + ASSERT0(dn->dn_next_nblkptr[i]); + ASSERT0(dn->dn_next_nlevels[i]); + ASSERT0(dn->dn_next_indblkshift[i]); + ASSERT0(dn->dn_next_bonuslen[i]); + ASSERT0(dn->dn_next_bonustype[i]); + ASSERT0(dn->dn_rm_spillblk[i]); + ASSERT0(dn->dn_next_blksz[i]); 
ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); - ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0); + ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } dn->dn_type = ot; dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + dn->dn_nblkptr = 1; + else + dn->dn_nblkptr = 1 + + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; @@ -407,10 +612,12 @@ dnode_allocate(dnode_t *dn, dmu_object_t } dn->dn_allocated_txg = tx->tx_txg; + dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } @@ -421,18 +628,22 @@ dnode_reallocate(dnode_t *dn, dmu_object int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); - ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); + ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || - (bonustype != DMU_OT_NONE && bonuslen != 0)); - ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); + (bonustype != DMU_OT_NONE && bonuslen != 0) || + (bonustype == DMU_OT_SA && bonuslen == 0)); + ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); + dn->dn_id_flags = 0; + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { @@ -445,9 +656,19 @@ dnode_reallocate(dnode_t *dn, dmu_object } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; - nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + nblkptr = 1; + else + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (dn->dn_bonustype != bonustype) + dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + dbuf_rm_spill(dn, tx); + dnode_rm_spill(dn, tx); + } rw_exit(&dn->dn_struct_rwlock); /* change type */ @@ -473,9 +694,304 @@ dnode_reallocate(dnode_t *dn, dmu_object mutex_exit(&dn->dn_mtx); } +#ifdef DNODE_STATS +static struct { + uint64_t dms_dnode_invalid; + uint64_t dms_dnode_recheck1; + uint64_t dms_dnode_recheck2; + uint64_t dms_dnode_special; + uint64_t dms_dnode_handle; + uint64_t dms_dnode_rwlock; + uint64_t dms_dnode_active; +} dnode_move_stats; +#endif /* DNODE_STATS */ + +static void +dnode_move_impl(dnode_t *odn, dnode_t *ndn) +{ + int i; + + ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); + ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); + + /* Copy fields. 
*/ + ndn->dn_objset = odn->dn_objset; + ndn->dn_object = odn->dn_object; + ndn->dn_dbuf = odn->dn_dbuf; + ndn->dn_handle = odn->dn_handle; + ndn->dn_phys = odn->dn_phys; + ndn->dn_type = odn->dn_type; + ndn->dn_bonuslen = odn->dn_bonuslen; + ndn->dn_bonustype = odn->dn_bonustype; + ndn->dn_nblkptr = odn->dn_nblkptr; + ndn->dn_checksum = odn->dn_checksum; + ndn->dn_compress = odn->dn_compress; + ndn->dn_nlevels = odn->dn_nlevels; + ndn->dn_indblkshift = odn->dn_indblkshift; + ndn->dn_datablkshift = odn->dn_datablkshift; + ndn->dn_datablkszsec = odn->dn_datablkszsec; + ndn->dn_datablksz = odn->dn_datablksz; + ndn->dn_maxblkid = odn->dn_maxblkid; + bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + sizeof (odn->dn_next_nblkptr)); + bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + sizeof (odn->dn_next_nlevels)); + bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + sizeof (odn->dn_next_indblkshift)); + bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + sizeof (odn->dn_next_bonustype)); + bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + sizeof (odn->dn_rm_spillblk)); + bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + sizeof (odn->dn_next_bonuslen)); + bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + sizeof (odn->dn_next_blksz)); + for (i = 0; i < TXG_SIZE; i++) { + list_move_tail(&ndn->dn_dirty_records[i], + &odn->dn_dirty_records[i]); + } + bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], + sizeof (odn->dn_free_ranges)); + ndn->dn_allocated_txg = odn->dn_allocated_txg; + ndn->dn_free_txg = odn->dn_free_txg; + ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirtyctx = odn->dn_dirtyctx; + ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ASSERT(refcount_count(&odn->dn_tx_holds) == 0); + refcount_transfer(&ndn->dn_holds, &odn->dn_holds); + ASSERT(avl_is_empty(&ndn->dn_dbufs)); + avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); + ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_bonus = odn->dn_bonus; + ndn->dn_have_spill = odn->dn_have_spill; + ndn->dn_zio = odn->dn_zio; + ndn->dn_oldused = odn->dn_oldused; + ndn->dn_oldflags = odn->dn_oldflags; + ndn->dn_olduid = odn->dn_olduid; + ndn->dn_oldgid = odn->dn_oldgid; + ndn->dn_newuid = odn->dn_newuid; + ndn->dn_newgid = odn->dn_newgid; + ndn->dn_id_flags = odn->dn_id_flags; + dmu_zfetch_init(&ndn->dn_zfetch, NULL); + list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); + ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + + /* + * Update back pointers. Updating the handle fixes the back pointer of + * every descendant dbuf as well as the bonus dbuf. + */ + ASSERT(ndn->dn_handle->dnh_dnode == odn); + ndn->dn_handle->dnh_dnode = ndn; + if (ndn->dn_zfetch.zf_dnode == odn) { + ndn->dn_zfetch.zf_dnode = ndn; + } + + /* + * Invalidate the original dnode by clearing all of its back pointers. + */ + odn->dn_dbuf = NULL; + odn->dn_handle = NULL; + avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + odn->dn_dbufs_count = 0; + odn->dn_bonus = NULL; + odn->dn_zfetch.zf_dnode = NULL; + + /* + * Set the low bit of the objset pointer to ensure that dnode_move() + * recognizes the dnode as invalid in any subsequent callback. + */ + POINTER_INVALIDATE(&odn->dn_objset); + + /* + * Satisfy the destructor. 
+ */ + for (i = 0; i < TXG_SIZE; i++) { + list_create(&odn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + odn->dn_free_ranges[i] = NULL; + odn->dn_next_nlevels[i] = 0; + odn->dn_next_indblkshift[i] = 0; + odn->dn_next_bonustype[i] = 0; + odn->dn_rm_spillblk[i] = 0; + odn->dn_next_bonuslen[i] = 0; + odn->dn_next_blksz[i] = 0; + } + odn->dn_allocated_txg = 0; + odn->dn_free_txg = 0; + odn->dn_assigned_txg = 0; + odn->dn_dirtyctx = 0; + odn->dn_dirtyctx_firstset = NULL; + odn->dn_have_spill = B_FALSE; + odn->dn_zio = NULL; + odn->dn_oldused = 0; + odn->dn_oldflags = 0; + odn->dn_olduid = 0; + odn->dn_oldgid = 0; + odn->dn_newuid = 0; + odn->dn_newgid = 0; + odn->dn_id_flags = 0; + + /* + * Mark the dnode. + */ + ndn->dn_moved = 1; + odn->dn_moved = (uint8_t)-1; +} + +#ifdef illumos +#ifdef _KERNEL +/*ARGSUSED*/ +static kmem_cbrc_t +dnode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + dnode_t *odn = buf, *ndn = newbuf; + objset_t *os; + int64_t refcount; + uint32_t dbufs; + + /* + * The dnode is on the objset's list of known dnodes if the objset + * pointer is valid. We set the low bit of the objset pointer when + * freeing the dnode to invalidate it, and the memory patterns written + * by kmem (baddcafe and deadbeef) set at least one of the two low bits. + * A newly created dnode sets the objset pointer last of all to indicate + * that the dnode is known and in a valid state to be moved by this + * function. + */ + os = odn->dn_objset; + if (!POINTER_IS_VALID(os)) { + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the objset does not go away during the move. + */ + rw_enter(&os_lock, RW_WRITER); + if (os != odn->dn_objset) { + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the dnode is still valid, then so is the objset. We know that no + * valid objset can be freed while we hold os_lock, so we can safely + * ensure that the objset remains in use. + */ + mutex_enter(&os->os_lock); + + /* + * Recheck the objset pointer in case the dnode was removed just before + * acquiring the lock. + */ + if (os != odn->dn_objset) { + mutex_exit(&os->os_lock); + rw_exit(&os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold os->os_lock, the dnode + * cannot be freed and fields within the dnode can be safely accessed. + * The objset listing this dnode cannot go away as long as this dnode is + * on its list. + */ + rw_exit(&os_lock); + if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); + return (KMEM_CBRC_NO); + } + ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ + + /* + * Lock the dnode handle to prevent the dnode from obtaining any new + * holds. This also prevents the descendant dbufs and the bonus dbuf + * from accessing the dnode, so that we can discount their holds. The + * handle is safe to access because we know that while the dnode cannot + * go away, neither can its handle. Once we hold dnh_zrlock, we can + * safely move any dnode referenced only by dbufs. 
+ */ + if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); + return (KMEM_CBRC_LATER); + } + + /* + * Ensure a consistent view of the dnode's holds and the dnode's dbufs. + * We need to guarantee that there is a hold for every dbuf in order to + * determine whether the dnode is actively referenced. Falsely matching + * a dbuf to an active hold would lead to an unsafe move. It's possible + * that a thread already having an active dnode hold is about to add a + * dbuf, and we can't compare hold and dbuf counts while the add is in + * progress. + */ + if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); + return (KMEM_CBRC_LATER); + } + + /* + * A dbuf may be removed (evicted) without an active dnode hold. In that + * case, the dbuf count is decremented under the handle lock before the + * dbuf's hold is released. This order ensures that if we count the hold + * after the dbuf is removed but before its hold is released, we will + * treat the unmatched hold as active and exit safely. If we count the + * hold before the dbuf is removed, the hold is discounted, and the + * removal is blocked until the move completes. + */ + refcount = refcount_count(&odn->dn_holds); + ASSERT(refcount >= 0); + dbufs = odn->dn_dbufs_count; + + /* We can't have more dbufs than dnode holds. */ + ASSERT3U(dbufs, <=, refcount); + DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, + uint32_t, dbufs); + + if (refcount > dbufs) { + rw_exit(&odn->dn_struct_rwlock); + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); + return (KMEM_CBRC_LATER); + } + + rw_exit(&odn->dn_struct_rwlock); + + /* + * At this point we know that anyone with a hold on the dnode is not + * actively referencing it. The dnode is known and in a valid state to + * move. We're holding the locks needed to execute the critical section. + */ + dnode_move_impl(odn, ndn); + + list_link_replace(&odn->dn_link, &ndn->dn_link); + /* If the dnode was safe to move, the refcount cannot have changed. */ + ASSERT(refcount == refcount_count(&ndn->dn_holds)); + ASSERT(dbufs == ndn->dn_dbufs_count); + zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ + mutex_exit(&os->os_lock); + + return (KMEM_CBRC_YES); +} +#endif /* _KERNEL */ +#endif /* illumos */ + void -dnode_special_close(dnode_t *dn) +dnode_special_close(dnode_handle_t *dnh) { + dnode_t *dn = dnh->dnh_dnode; + /* * Wait for final references to the dnode to clear. This can * only happen if the arc is asyncronously evicting state that @@ -483,49 +999,63 @@ dnode_special_close(dnode_t *dn) * dnode. 
*/ while (refcount_count(&dn->dn_holds) > 0) - xdelay(1); - dnode_destroy(dn); + delay(1); + ASSERT(dn->dn_dbuf == NULL || + dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); + zrl_add(&dnh->dnh_zrlock); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } -dnode_t * -dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object) +void +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, + dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object); + dnode_t *dn; + + dn = dnode_create(os, dnp, NULL, object, dnh); + zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); - return (dn); } static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) +dnode_buf_evict_async(void *dbu) { - dnode_t **children_dnodes = arg; + dnode_children_t *children_dnodes = dbu; int i; - int epb = db->db_size >> DNODE_SHIFT; - for (i = 0; i < epb; i++) { - dnode_t *dn = children_dnodes[i]; - int n; + for (i = 0; i < children_dnodes->dnc_count; i++) { + dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; + dnode_t *dn; - if (dn == NULL) + /* + * The dnode handle lock guards against the dnode moving to + * another valid address, so there is no need here to guard + * against changes to or from NULL. + */ + if (dnh->dnh_dnode == NULL) { + zrl_destroy(&dnh->dnh_zrlock); continue; -#ifdef ZFS_DEBUG + } + + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus - * it wouldn't be eligable for eviction and this function + * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(refcount_is_zero(&dn->dn_holds)); - ASSERT(list_head(&dn->dn_dbufs) == NULL); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); - for (n = 0; n < TXG_SIZE; n++) - ASSERT(!list_link_active(&dn->dn_dirty_link[n])); -#endif - children_dnodes[i] = NULL; - dnode_destroy(dn); + dnode_destroy(dn); /* implicit zrl_remove() */ + zrl_destroy(&dnh->dnh_zrlock); + dnh->dnh_dnode = NULL; } - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + kmem_free(children_dnodes, sizeof (dnode_children_t) + + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* @@ -544,24 +1074,29 @@ dnode_hold_impl(objset_t *os, uint64_t o uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; - dnode_t **children_dnodes; + dnode_children_t *children_dnodes; + dnode_handle_t *dnh; /* * If you are holding the spa config lock as writer, you shouldn't - * be asking the DMU to do *anything*. + * be asking the DMU to do *anything* unless it's the root pool + * which may require us to read from the root filesystem while + * holding some (not all) of the locks as writer. */ - ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || + (spa_is_root(os->os_spa) && + spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? 
- os->os_userused_dnode : os->os_groupused_dnode; + DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) - return (ENOENT); + return (SET_ERROR(ENOENT)); type = dn->dn_type; if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) - return (ENOENT); + return (SET_ERROR(ENOENT)); if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) - return (EEXIST); + return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); (void) refcount_add(&dn->dn_holds, tag); *dnp = dn; @@ -569,9 +1104,10 @@ dnode_hold_impl(objset_t *os, uint64_t o } if (object == 0 || object >= DN_MAX_OBJECT) - return (EINVAL); + return (SET_ERROR(EINVAL)); - mdn = os->os_meta_dnode; + mdn = DMU_META_DNODE(os); + ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); @@ -580,13 +1116,13 @@ dnode_hold_impl(objset_t *os, uint64_t o drop_struct_lock = TRUE; } - blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); + blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); if (db == NULL) - return (EIO); + return (SET_ERROR(EIO)); err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { dbuf_rele(db, FTAG); @@ -598,28 +1134,41 @@ dnode_hold_impl(objset_t *os, uint64_t o idx = object & (epb-1); + ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { - dnode_t **winner; - children_dnodes = kmem_zalloc(epb * sizeof (dnode_t *), - KM_SLEEP); - if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL, - dnode_buf_pageout)) { - kmem_free(children_dnodes, epb * sizeof (dnode_t *)); + int i; + dnode_children_t *winner; + children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + + epb * sizeof (dnode_handle_t), KM_SLEEP); + children_dnodes->dnc_count = epb; + dnh = &children_dnodes->dnc_children[0]; + for (i = 0; i < epb; i++) { + zrl_init(&dnh[i].dnh_zrlock); + } + dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL, + dnode_buf_evict_async, NULL); + winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + if (winner != NULL) { + + for (i = 0; i < epb; i++) { + zrl_destroy(&dnh[i].dnh_zrlock); + } + + kmem_free(children_dnodes, sizeof (dnode_children_t) + + epb * sizeof (dnode_handle_t)); children_dnodes = winner; } } + ASSERT(children_dnodes->dnc_count == epb); - if ((dn = children_dnodes[idx]) == NULL) { - dnode_phys_t *dnp = (dnode_phys_t *)db->db.db_data+idx; - dnode_t *winner; + dnh = &children_dnodes->dnc_children[idx]; + zrl_add(&dnh->dnh_zrlock); + dn = dnh->dnh_dnode; + if (dn == NULL) { + dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; - dn = dnode_create(os, dnp, db, object); - winner = atomic_cas_ptr(&children_dnodes[idx], NULL, dn); - if (winner != NULL) { - dnode_destroy(dn); - dn = winner; - } + dn = dnode_create(os, phys, db, object, dnh); } mutex_enter(&dn->dn_mtx); @@ -627,15 +1176,18 @@ dnode_hold_impl(objset_t *os, uint64_t o if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || ((flag & DNODE_MUST_BE_FREE) && - (type != DMU_OT_NONE || dn->dn_oldphys))) { + (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); + zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } + if (refcount_add(&dn->dn_holds, tag) == 1) + dbuf_add_ref(db, dnh); mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, tag) == 1) - dbuf_add_ref(db, dn); + /* Now we can rely on the hold to prevent the dnode from moving. 
*/ + zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); @@ -676,14 +1228,44 @@ dnode_add_ref(dnode_t *dn, void *tag) void dnode_rele(dnode_t *dn, void *tag) { + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, tag); +} + +void +dnode_rele_and_unlock(dnode_t *dn, void *tag) +{ uint64_t refs; + /* Get while the hold prevents the dnode from moving. */ + dmu_buf_impl_t *db = dn->dn_dbuf; + dnode_handle_t *dnh = dn->dn_handle; - mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); + + /* + * It's unsafe to release the last hold on a dnode by dnode_rele() or + * indirectly by dbuf_rele() while relying on the dnode handle to + * prevent the dnode from moving, since releasing the last hold could + * result in the dnode's parent dbuf evicting its dnode handles. For + * that reason anyone calling dnode_rele() or dbuf_rele() without some + * other direct or indirect hold on the dnode must first drop the dnode + * handle. + */ + ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ - if (refs == 0 && dn->dn_dbuf) - dbuf_rele(dn->dn_dbuf, dn); + if (refs == 0 && db != NULL) { + /* + * Another thread could add a hold to the dnode handle in + * dnode_hold_impl() while holding the parent dbuf. Since the + * hold on the parent dbuf prevents the handle from being + * destroyed, the hold on the handle is OK. We can't yet assert + * that the handle has zero references, but that will be + * asserted anyway when the handle gets destroyed. + */ + dbuf_rele(db, dnh); + } } void @@ -702,10 +1284,15 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); - /* ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); */ + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif + /* + * Determine old uid/gid when necessary + */ + dmu_objset_userquota_get_ids(dn, B_TRUE, tx); + mutex_enter(&os->os_lock); /* @@ -716,10 +1303,12 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx return; } - ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); + ASSERT(!refcount_is_zero(&dn->dn_holds) || + !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); - ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); - ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); + ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); + ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); + ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", dn->dn_object, txg); @@ -735,7 +1324,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child - * dbuf maintaines a hold on the dnode. When the last child + * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. 
We add a "dirty hold" here so that the * dnode will hang around after we finish processing its @@ -788,13 +1377,12 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx) int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next; + dmu_buf_impl_t *db; int err; + ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; - if (size > SPA_MAXBLOCKSIZE) - size = SPA_MAXBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); @@ -807,14 +1395,14 @@ dnode_set_blksz(dnode_t *dn, uint64_t si rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* Check for any allocated blocks beyond the first */ - if (dn->dn_phys->dn_maxblkid != 0) + if (dn->dn_maxblkid != 0) goto fail; mutex_enter(&dn->dn_dbufs_mtx); - for (db = list_head(&dn->dn_dbufs); db; db = db_next) { - db_next = list_next(&dn->dn_dbufs, db); - - if (db->db_blkid != 0 && db->db_blkid != DB_BONUS_BLKID) { + for (db = avl_first(&dn->dn_dbufs); db != NULL; + db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } @@ -825,7 +1413,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t si goto fail; /* resize the old block */ - err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) @@ -847,7 +1435,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t si fail: rw_exit(&dn->dn_struct_rwlock); - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } /* read-holding callers must not rely on the lock being continuously held */ @@ -858,7 +1446,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t bl int epbs, new_nlevels; uint64_t sz; - ASSERT(blkid != DB_BONUS_BLKID); + ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? 
RW_READ_HELD(&dn->dn_struct_rwlock) : @@ -905,6 +1493,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t bl /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); @@ -915,7 +1504,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t bl for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) { + dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); @@ -931,56 +1521,13 @@ out: rw_downgrade(&dn->dn_struct_rwlock); } -void -dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +static void +dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) { - avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; - avl_index_t where; - free_range_t *rp; - free_range_t rp_tofind; - uint64_t endblk = blkid + nblks; - - ASSERT(MUTEX_HELD(&dn->dn_mtx)); - ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */ - - dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", - blkid, nblks, tx->tx_txg); - rp_tofind.fr_blkid = blkid; - rp = avl_find(tree, &rp_tofind, &where); - if (rp == NULL) - rp = avl_nearest(tree, where, AVL_BEFORE); - if (rp == NULL) - rp = avl_nearest(tree, where, AVL_AFTER); - - while (rp && (rp->fr_blkid <= blkid + nblks)) { - uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks; - free_range_t *nrp = AVL_NEXT(tree, rp); - - if (blkid <= rp->fr_blkid && endblk >= fr_endblk) { - /* clear this entire range */ - avl_remove(tree, rp); - kmem_free(rp, sizeof (free_range_t)); - } else if (blkid <= rp->fr_blkid && - endblk > rp->fr_blkid && endblk < fr_endblk) { - /* clear the beginning of this range */ - rp->fr_blkid = endblk; - rp->fr_nblks = fr_endblk - endblk; - } else if (blkid > rp->fr_blkid && blkid < fr_endblk && - endblk >= fr_endblk) { - /* clear the end of this range */ - rp->fr_nblks = blkid - rp->fr_blkid; - } else if (blkid > rp->fr_blkid && endblk < fr_endblk) { - /* clear a chunk out of this range */ - free_range_t *new_rp = - kmem_alloc(sizeof (free_range_t), KM_SLEEP); - - new_rp->fr_blkid = endblk; - new_rp->fr_nblks = fr_endblk - endblk; - avl_insert_here(tree, new_rp, rp, AVL_AFTER); - rp->fr_nblks = blkid - rp->fr_blkid; - } - /* there may be no overlap */ - rp = nrp; + dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); + if (db != NULL) { + dmu_buf_will_dirty(&db->db, tx); + dbuf_rele(db, FTAG); } } @@ -998,7 +1545,7 @@ dnode_free_range(dnode_t *dn, uint64_t o blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - if (len == -1ULL) { + if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } @@ -1014,7 +1561,13 @@ dnode_free_range(dnode_t *dn, uint64_t o } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { - /* Freeing the whole block; fast-track this request */ + /* + * Freeing the whole block; fast-track this request. + * Note that we won't dirty any indirect blocks, + * which is fine because we will be freeing the entire + * file and thus all indirect blocks will be freed + * by free_children(). 
+ */ blkid = 0; nblks = 1; goto done; @@ -1033,15 +1586,15 @@ dnode_free_range(dnode_t *dn, uint64_t o ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, - FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); @@ -1066,18 +1619,18 @@ dnode_free_range(dnode_t *dn, uint64_t o else tail = P2PHASE(len, blksz); - ASSERT3U(P2PHASE(off, blksz), ==, 0); + ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { if (len < tail) tail = len; - if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), - TRUE, FTAG, &db) == 0) { + if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } @@ -1098,87 +1651,116 @@ dnode_free_range(dnode_t *dn, uint64_t o nblks += 1; /* - * Read in and mark all the level-1 indirects dirty, - * so that they will stay in memory until syncing phase. - * Always dirty the first and last indirect to make sure - * we dirty all the partial indirects. + * Dirty all the indirect blocks in this range. Note that only + * the first and last indirect blocks can actually be written + * (if they were partially freed) -- they must be dirtied, even if + * they do not exist on disk yet. The interior blocks will + * be freed by free_children(), so they will not actually be written. + * Even though these interior blocks will not be written, we + * dirty them for two reasons: + * + * - It ensures that the indirect blocks remain in memory until + * syncing context. (They have already been prefetched by + * dmu_tx_hold_free(), so we don't have to worry about reading + * them serially here.) + * + * - The dirty space accounting will put pressure on the txg sync + * mechanism to begin syncing, and to delay transactions if there + * is a large amount of freeing. Even though these indirect + * blocks will not be written, we could need to write the same + * amount of space if we copy the freed BPs into deadlists. 
*/ if (dn->dn_nlevels > 1) { - uint64_t i, first, last; - int shift = epbs + dn->dn_datablkshift; + uint64_t first, last; first = blkid >> epbs; - if (db = dbuf_hold_level(dn, 1, first, FTAG)) { - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } + dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; - if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } - for (i = first + 1; i < last; i++) { - uint64_t ibyte = i << shift; - int err; + if (last != first) + dnode_dirty_l1(dn, last, tx); - err = dnode_next_offset(dn, - DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); + int shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + for (uint64_t i = first + 1; i < last; i++) { + /* + * Set i to the blockid of the next non-hole + * level-1 indirect block at or after i. Note + * that dnode_next_offset() operates in terms of + * level-0-equivalent bytes. + */ + uint64_t ibyte = i << shift; + int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, + &ibyte, 2, 1, 0); i = ibyte >> shift; - if (err == ESRCH || i >= last) + if (i >= last) break; - ASSERT(err == 0); - db = dbuf_hold_level(dn, 1, i, FTAG); - if (db) { - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } + + /* + * Normally we should not see an error, either + * from dnode_next_offset() or dbuf_hold_level() + * (except for ESRCH from dnode_next_offset). + * If there is an i/o error, then when we read + * this block in syncing context, it will use + * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according + * to the "failmode" property. dnode_next_offset() + * doesn't have a flag to indicate MUSTSUCCEED. + */ + if (err != 0) + break; + + dnode_dirty_l1(dn, i, tx); } } + done: /* * Add this range to the dnode range list. * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); - dnode_clear_range(dn, blkid, nblks, tx); - { - free_range_t *rp, *found; - avl_index_t where; - avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK]; - - /* Add new range to dn_ranges */ - rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP); - rp->fr_blkid = blkid; - rp->fr_nblks = nblks; - found = avl_find(tree, rp, &where); - ASSERT(found == NULL); - avl_insert(tree, rp, where); - dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", - blkid, nblks, tx->tx_txg); + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] == NULL) { + dn->dn_free_ranges[txgoff] = + range_tree_create(NULL, NULL, &dn->dn_mtx); } + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); + range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); mutex_exit(&dn->dn_mtx); dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: - if (trunc && dn->dn_maxblkid >= (off >> blkshift)) - dn->dn_maxblkid = (off >> blkshift ? 
(off >> blkshift) - 1 : 0); rw_exit(&dn->dn_struct_rwlock); } +static boolean_t +dnode_spill_freed(dnode_t *dn) +{ + int i; + + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { - free_range_t range_tofind; void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; - if (blkid == DB_BONUS_BLKID) + if (blkid == DMU_BONUS_BLKID) return (FALSE); /* @@ -1191,20 +1773,13 @@ dnode_block_freed(dnode_t *dn, uint64_t if (dn->dn_free_txg) return (TRUE); - range_tofind.fr_blkid = blkid; + if (blkid == DMU_SPILL_BLKID) + return (dnode_spill_freed(dn)); + mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { - free_range_t *range_found; - avl_index_t idx; - - range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx); - if (range_found) { - ASSERT(range_found->fr_nblks > 0); - break; - } - range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE); - if (range_found && - range_found->fr_blkid + range_found->fr_nblks > blkid) + if (dn->dn_free_ranges[i] != NULL && + range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) break; } mutex_exit(&dn->dn_mtx); @@ -1231,7 +1806,7 @@ dnode_diduse_space(dnode_t *dn, int64_t space += delta; if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); - ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0); + ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT)); dn->dn_phys->dn_used = space >> DEV_BSHIFT; } else { dn->dn_phys->dn_used = space; @@ -1241,34 +1816,35 @@ } /* - * Call when we think we're going to write/free space in open context. - * Be conservative (ie. OK to write less than this or free more than - * this, but don't write more or free less). + * Call when we think we're going to write/free space in open context to track + * the amount of memory in use by the currently open txg. */ void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; dsl_dataset_t *ds = os->os_dsl_dataset; + int64_t aspace = spa_get_asize(os->os_spa, space); - if (space > 0) - space = spa_get_asize(os->os_spa, space); - - if (ds) - dsl_dir_willuse_space(ds->ds_dir, space, tx); + if (ds != NULL) { + dsl_dir_willuse_space(ds->ds_dir, aspace, tx); + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); + } - dmu_tx_willuse_space(tx, space); + dmu_tx_willuse_space(tx, aspace); } /* - * This function scans a block at the indicated "level" looking for - * a hole or data (depending on 'flags'). If level > 0, then we are - * scanning an indirect block looking at its pointers. If level == 0, - * then we are looking at a block of dnodes. If we don't find what we - * are looking for in the block, we return ESRCH. Otherwise, return - * with *offset pointing to the beginning (if searching forwards) or - * end (if searching backwards) of the range covered by the block - * pointer we matched on (or dnode). + * Scans a block at the indicated "level" looking for a hole or data, + * depending on 'flags'. + * + * If level > 0, then we are scanning an indirect block looking at its + * pointers. If level == 0, then we are looking at a block of dnodes. + * + * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching + * forwards) or end (if searching backwards) of the range covered by the + * block pointer we matched on (or dnode). * * The basic search algorithm used below by dnode_next_offset() is to * use this function to search up the block tree (widen the search) until @@ -1278,7 +1854,7 @@ dnode_willuse_space(dnode_t *dn, int64_t */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, - int lvl, uint64_t blkfill, uint64_t txg) + int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; @@ -1300,8 +1876,8 @@ dnode_next_offset_level(dnode_t *dn, int epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { - uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); - error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); + uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); if (error) { if (error != ENOENT) return (error); @@ -1314,7 +1890,7 @@ dnode_next_offset_level(dnode_t *dn, int * at the pointer to this block in its parent, and its * going to be unallocated, so we will skip over it. */ - return (ESRCH); + return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { @@ -1324,13 +1900,15 @@ dnode_next_offset_level(dnode_t *dn, int data = db->db.db_data; } - if (db && txg && - (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + + if (db != NULL && txg != 0 && (db->db_blkptr == NULL || + db->db_blkptr->blk_birth <= txg || + BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. */ - error = ESRCH; + error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; span = DNODE_SHIFT; @@ -1343,7 +1921,7 @@ dnode_next_offset_level(dnode_t *dn, int *offset += (1ULL << span) * inc; } if (i < 0 || i == blkfill) - error = ESRCH; + error = SET_ERROR(ESRCH); } else { blkptr_t *bp = data; uint64_t start = *offset; @@ -1359,8 +1937,8 @@ dnode_next_offset_level(dnode_t *dn, int *offset = *offset >> span; for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { - if (bp[i].blk_fill >= minfill && - bp[i].blk_fill <= maxfill && + if (BP_GET_FILL(&bp[i]) >= minfill && + BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) @@ -1375,7 +1953,7 @@ dnode_next_offset_level(dnode_t *dn, int *offset = start; } if (i < 0 || i >= epb) - error = ESRCH; + error = SET_ERROR(ESRCH); } if (db) @@ -1419,7 +1997,7 @@ dnode_next_offset(dnode_t *dn, int flags rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { - error = ESRCH; + error = SET_ERROR(ESRCH); goto out; } @@ -1428,7 +2006,7 @@ dnode_next_offset(dnode_t *dn, int flags if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { - error = ESRCH; + error = SET_ERROR(ESRCH); } goto out; } @@ -1447,9 +2025,18 @@ dnode_next_offset(dnode_t *dn, int flags flags, offset, lvl, blkfill, txg); } + /* + * There's always a "virtual hole" at the end of the object, even + * if all BP's which physically exist are non-holes. + */ + if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && + minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { + error = 0; + } + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? 
initial_offset < *offset : initial_offset > *offset)) - error = ESRCH; + error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c,v retrieving revision 1.5 diff -u -p -r1.5 dnode_sync.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c 21 Jun 2013 16:22:46 -0000 1.5 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c 10 Oct 2016 11:09:56 -0000 @@ -18,9 +18,11 @@ * * CDDL HEADER END */ + /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include @@ -31,6 +33,8 @@ #include #include #include +#include +#include static void dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) @@ -56,28 +60,27 @@ dnode_increase_indirection(dnode_t *dn, dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); - /* check for existing blkptrs in the dnode */ - for (i = 0; i < nblkptr; i++) - if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) - break; - if (i != nblkptr) { - /* transfer dnode's block pointers to new indirect block */ - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); - ASSERT(db->db.db_data); - ASSERT(arc_released(db->db_buf)); - ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, - sizeof (blkptr_t) * nblkptr); - arc_buf_freeze(db->db_buf); - } + /* transfer dnode's block pointers to new indirect block */ + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + ASSERT(db->db.db_data); + ASSERT(arc_released(db->db_buf)); + ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); + bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + sizeof (blkptr_t) * nblkptr); + arc_buf_freeze(db->db_buf); /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { - dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i); + dmu_buf_impl_t *child = + dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); if (child == NULL) continue; - ASSERT3P(child->db_dnode, ==, dn); +#ifdef DEBUG + DB_DNODE_ENTER(child); + ASSERT3P(DB_DNODE(child), ==, dn); + DB_DNODE_EXIT(child); +#endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != @@ -107,26 +110,44 @@ dnode_increase_indirection(dnode_t *dn, rw_exit(&dn->dn_struct_rwlock); } -static int +static void free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - int i, blocks_freed = 0; dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); - for (i = 0; i < num; i++, bp++) { + for (int i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); + + /* + * Save some useful information on the holes being + * punched, including logical size, type, and indirection + * level. 
Retaining birth time enables detection of when + * holes are punched for reducing the number of free + * records transmitted during a zfs send. + */ + + uint64_t lsize = BP_GET_LSIZE(bp); + dmu_object_type_t type = BP_GET_TYPE(bp); + uint64_t lvl = BP_GET_LEVEL(bp); + bzero(bp, sizeof (blkptr_t)); - blocks_freed += 1; + + if (spa_feature_is_active(dn->dn_objset->os_spa, + SPA_FEATURE_HOLE_BIRTH)) { + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, type); + BP_SET_LEVEL(bp, lvl); + BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); + } } dnode_diduse_space(dn, -bytesfreed); - return (blocks_freed); } #ifdef ZFS_DEBUG @@ -136,15 +157,18 @@ free_verify(dmu_buf_impl_t *db, uint64_t int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; + dnode_t *dn; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<<epbs); num = end - start + 1; ASSERT3U(off, >=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); - ASSERT3U(db->db.db_size, ==, 1<<db->db_dnode->dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); @@ -156,10 +180,10 @@ free_verify(dmu_buf_impl_t *db, uint64_t ASSERT(db->db_level == 1); - rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(db->db_dnode, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); - rw_exit(&db->db_dnode->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level-1, - (db->db_blkid << epbs) + i, TRUE, FTAG, &child); + (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); + rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); @@ -201,43 +225,40 @@ free_verify(dmu_buf_impl_t *db, uint64_t dbuf_rele(child, FTAG); } + DB_DNODE_EXIT(db); } #endif -#define ALL -1 - -static int -free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, +static void +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) { - dnode_t *dn = db->db_dnode; + dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; - int epbs, shift, err; - int all = TRUE; - int blocks_freed = 0; + int epbs, shift; /* * There is a small possibility that this block will not be cached: * 1 - if level > 1 and there are no children with level <= 1 - * 2 - if we didn't get a dirty hold (because this block had just - * finished being written -- and so had no holds), and then this - * block got evicted before we got here. + * 2 - if this block was evicted since we read it from + * dmu_tx_hold_free().
*/ if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); - arc_release(db->db_buf, db); - bp = (blkptr_t *)db->db.db_data; + dbuf_release_bp(db); + bp = db->db.db_data; - epbs = db->db_dnode->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; if (dbstart < start) { bp += start - dbstart; - all = FALSE; } else { start = dbstart; } @@ -245,68 +266,68 @@ free_children(dmu_buf_impl_t *db, uint64 end = (blkid + nblks - 1) >> shift; if (dbend <= end) end = dbend; - else if (all) - all = trunc; + ASSERT3U(start, <=, end); if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - blocks_freed = free_blocks(dn, bp, end-start+1, tx); - arc_buf_freeze(db->db_buf); - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); - return (all ? ALL : blocks_freed); + free_blocks(dn, bp, end-start+1, tx); + } else { + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, + i, TRUE, FALSE, FTAG, &subdb)); + rw_exit(&dn->dn_struct_rwlock); + ASSERT3P(bp, ==, subdb->db_blkptr); + + free_children(subdb, blkid, nblks, tx); + dbuf_rele(subdb, FTAG); + } } - for (i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); - ASSERT3U(err, ==, 0); - rw_exit(&dn->dn_struct_rwlock); - - if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { - ASSERT3P(subdb->db_blkptr, ==, bp); - blocks_freed += free_blocks(dn, bp, 1, tx); - } else { - all = FALSE; - } - dbuf_rele(subdb, FTAG); + /* If this whole block is free, free ourself too. */ + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + if (!BP_IS_HOLE(bp)) + break; } - arc_buf_freeze(db->db_buf); -#ifdef ZFS_DEBUG - bp -= (end-start)+1; - for (i = start; i <= end; i++, bp++) { - if (i == start && blkid != 0) - continue; - else if (i == end && !trunc) - continue; - ASSERT3U(bp->blk_birth, ==, 0); + if (i == 1 << epbs) { + /* didn't find any non-holes */ + bzero(db->db.db_data, db->db.db_size); + free_blocks(dn, db->db_blkptr, 1, tx); + } else { + /* + * Partial block free; must be marked dirty so that it + * will be written out. + */ + ASSERT(db->db_dirtycnt > 0); } -#endif - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); - return (all ? ALL : blocks_freed); + + DB_DNODE_EXIT(db); + arc_buf_freeze(db->db_buf); } /* - * free_range: Traverse the indicated range of the provided file + * Traverse the indicated range of the provided file * and "free" all the blocks contained there. 
*/ static void -dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, + dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; - dmu_buf_impl_t *db; - int trunc, start, end, shift, i, err; int dnlevel = dn->dn_phys->dn_nlevels; + boolean_t trunc = B_FALSE; if (blkid > dn->dn_phys->dn_maxblkid) return; ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); - trunc = blkid + nblks > dn->dn_phys->dn_maxblkid; - if (trunc) + if (blkid + nblks > dn->dn_phys->dn_maxblkid) { nblks = dn->dn_phys->dn_maxblkid - blkid + 1; + trunc = B_TRUE; + } /* There are no indirect blocks in the object */ if (dnlevel == 1) { @@ -315,102 +336,110 @@ dnode_sync_free_range(dnode_t *dn, uint6 return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); - (void) free_blocks(dn, bp + blkid, nblks, tx); - if (trunc) { - uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * - (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); - dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); - ASSERT(off < dn->dn_phys->dn_maxblkid || - dn->dn_phys->dn_maxblkid == 0 || - dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); - } - return; - } - - shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); - start = blkid >> shift; - ASSERT(start < dn->dn_phys->dn_nblkptr); - end = (blkid + nblks - 1) >> shift; - bp += start; - for (i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db); - ASSERT3U(err, ==, 0); - rw_exit(&dn->dn_struct_rwlock); + free_blocks(dn, bp + blkid, nblks, tx); + } else { + int shift = (dnlevel - 1) * + (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + int start = blkid >> shift; + int end = (blkid + nblks - 1) >> shift; + dmu_buf_impl_t *db; + + ASSERT(start < dn->dn_phys->dn_nblkptr); + bp += start; + for (int i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, + TRUE, FALSE, FTAG, &db)); + rw_exit(&dn->dn_struct_rwlock); - if (free_children(db, blkid, nblks, trunc, tx) == ALL) { - ASSERT3P(db->db_blkptr, ==, bp); - (void) free_blocks(dn, bp, 1, tx); + free_children(db, blkid, nblks, tx); + dbuf_rele(db, FTAG); } - dbuf_rele(db, FTAG); } + if (trunc) { + dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; + uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); - dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } +typedef struct dnode_sync_free_range_arg { + dnode_t *dsfra_dnode; + dmu_tx_t *dsfra_tx; +} dnode_sync_free_range_arg_t; + +static void +dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) +{ + dnode_sync_free_range_arg_t *dsfra = arg; + dnode_t *dn = dsfra->dsfra_dnode; + + mutex_exit(&dn->dn_mtx); + dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx); + mutex_enter(&dn->dn_mtx); +} + /* - * Try to kick all the dnodes dbufs out of the cache... + * Try to kick all the dnode's dbufs out of the cache... 
*/ void dnode_evict_dbufs(dnode_t *dn) { - int progress; - int pass = 0; + dmu_buf_impl_t db_marker; + dmu_buf_impl_t *db, *db_next; - do { - dmu_buf_impl_t *db, marker; - int evicting = FALSE; - - progress = FALSE; - mutex_enter(&dn->dn_dbufs_mtx); - list_insert_tail(&dn->dn_dbufs, &marker); - db = list_head(&dn->dn_dbufs); - for (; db != &marker; db = list_head(&dn->dn_dbufs)) { - list_remove(&dn->dn_dbufs, db); - list_insert_tail(&dn->dn_dbufs, db); - ASSERT3P(db->db_dnode, ==, dn); - - mutex_enter(&db->db_mtx); - if (db->db_state == DB_EVICTING) { - progress = TRUE; - evicting = TRUE; - mutex_exit(&db->db_mtx); - } else if (refcount_is_zero(&db->db_holds)) { - progress = TRUE; - dbuf_clear(db); /* exits db_mtx for us */ - } else { - mutex_exit(&db->db_mtx); - } + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + +#ifdef DEBUG + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); +#endif /* DEBUG */ + + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) { + db_marker.db_level = db->db_level; + db_marker.db_blkid = db->db_blkid; + db_marker.db_state = DB_SEARCH; + avl_insert_here(&dn->dn_dbufs, &db_marker, db, + AVL_BEFORE); + + dbuf_destroy(db); + db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); + avl_remove(&dn->dn_dbufs, &db_marker); + } else { + db->db_pending_evict = TRUE; + mutex_exit(&db->db_mtx); + db_next = AVL_NEXT(&dn->dn_dbufs, db); } - list_remove(&dn->dn_dbufs, &marker); - /* - * NB: we need to drop dn_dbufs_mtx between passes so - * that any DB_EVICTING dbufs can make progress. - * Ideally, we would have some cv we could wait on, but - * since we don't, just wait a bit to give the other - * thread a chance to run. - */ - mutex_exit(&dn->dn_dbufs_mtx); - if (evicting) - xdelay(1); - pass++; - ASSERT(pass < 100); /* sanity check */ - } while (progress); + } + mutex_exit(&dn->dn_dbufs_mtx); + dnode_evict_bonus(dn); +} + +void +dnode_evict_bonus(dnode_t *dn) +{ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { - mutex_enter(&dn->dn_bonus->db_mtx); - dbuf_evict(dn->dn_bonus); - dn->dn_bonus = NULL; + if (dn->dn_bonus != NULL) { + if (refcount_is_zero(&dn->dn_bonus->db_holds)) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_destroy(dn->dn_bonus); + dn->dn_bonus = NULL; + } else { + dn->dn_bonus->db_pending_evict = TRUE; + } } rw_exit(&dn->dn_struct_rwlock); } @@ -434,9 +463,12 @@ dnode_undirty_dbufs(list_t *list) db->db_last_dirty = NULL; db->db_dirtycnt -= 1; if (db->db_level == 0) { - ASSERT(db->db_blkid == DB_BONUS_BLKID || + ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); + } else { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); @@ -454,12 +486,11 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t * Our contents should have been freed in dnode_sync() by the * free range record inserted by the caller of dnode_free().
*/ - ASSERT3U(DN_USED_BYTES(dn->dn_phys), ==, 0); + ASSERT0(DN_USED_BYTES(dn->dn_phys)); ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* * XXX - It would be nice to assert this, but we may still @@ -482,7 +513,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) - dbuf_will_dirty(dn->dn_dbuf, tx); + dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); bzero(dn->dn_phys, sizeof (dnode_phys_t)); mutex_enter(&dn->dn_mtx); @@ -490,6 +521,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; + dn->dn_have_spill = B_FALSE; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -507,11 +539,11 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *t void dnode_sync(dnode_t *dn, dmu_tx_t *tx) { - free_range_t *rp; dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; static const dnode_phys_t zerodn = { 0 }; + boolean_t kill_spill = B_FALSE; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); @@ -523,10 +555,12 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) if (dmu_objset_userused_enabled(dn->dn_objset) && !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { - ASSERT(dn->dn_oldphys == NULL); - dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t)); - *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */ + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); + dn->dn_oldflags = dn->dn_phys->dn_flags; dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + mutex_exit(&dn->dn_mtx); + dmu_objset_userquota_get_ids(dn, B_FALSE, tx); } else { /* Once we account for it, we should always account for it. 
*/ ASSERT(!(dn->dn_phys->dn_flags & @@ -546,25 +580,34 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } - ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT(dnp->dn_nlevels < 2 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); - if (dn->dn_next_blksz[txgoff]) { + if (dn->dn_next_type[txgoff] != 0) { + dnp->dn_type = dn->dn_type; + dn->dn_next_type[txgoff] = 0; + } + + if (dn->dn_next_blksz[txgoff] != 0) { ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == - dnp->dn_datablkszsec); + dnp->dn_datablkszsec || + range_tree_space(dn->dn_free_ranges[txgoff]) != 0); dnp->dn_datablkszsec = dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; dn->dn_next_blksz[txgoff] = 0; } - if (dn->dn_next_bonuslen[txgoff]) { + if (dn->dn_next_bonuslen[txgoff] != 0) { if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) dnp->dn_bonuslen = 0; else @@ -573,7 +616,26 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_bonuslen[txgoff] = 0; } - if (dn->dn_next_indblkshift[txgoff]) { + if (dn->dn_next_bonustype[txgoff] != 0) { + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); + dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; + dn->dn_next_bonustype[txgoff] = 0; + } + + boolean_t freeing_dnode = dn->dn_free_txg > 0 && + dn->dn_free_txg <= tx->tx_txg; + + /* + * Remove the spill block if we have been explicitly asked to + * remove it, or if the object is being removed. 
+ */ + if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + kill_spill = B_TRUE; + dn->dn_rm_spillblk[txgoff] = 0; + } + + if (dn->dn_next_indblkshift[txgoff] != 0) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; dn->dn_next_indblkshift[txgoff] = 0; @@ -589,21 +651,36 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); + if (kill_spill) { + free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); + mutex_enter(&dn->dn_mtx); + dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + /* process all the "freed" ranges in the file */ - while (rp = avl_last(&dn->dn_ranges[txgoff])) { - dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); - /* grab the mutex so we don't race with dnode_block_freed() */ + if (dn->dn_free_ranges[txgoff] != NULL) { + dnode_sync_free_range_arg_t dsfra; + dsfra.dsfra_dnode = dn; + dsfra.dsfra_tx = tx; mutex_enter(&dn->dn_mtx); - avl_remove(&dn->dn_ranges[txgoff], rp); + range_tree_vacate(dn->dn_free_ranges[txgoff], + dnode_sync_free_range, &dsfra); + range_tree_destroy(dn->dn_free_ranges[txgoff]); + dn->dn_free_ranges[txgoff] = NULL; mutex_exit(&dn->dn_mtx); - kmem_free(rp, sizeof (free_range_t)); } - if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { + if (freeing_dnode) { dnode_sync_free(dn, tx); return; } + if (dn->dn_next_nlevels[txgoff]) { + dnode_increase_indirection(dn, tx); + dn->dn_next_nlevels[txgoff] = 0; + } + if (dn->dn_next_nblkptr[txgoff]) { /* this should only happen on a realloc */ ASSERT(dn->dn_allocated_txg == tx->tx_txg); @@ -628,12 +705,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); } - if (dn->dn_next_nlevels[txgoff]) { - dnode_increase_indirection(dn, tx); - dn->dn_next_nlevels[txgoff] = 0; - } - - dbuf_sync_list(list, tx); + dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_bookmark.c 10 Oct 2016 11:09:56 -0000 @@ -0,0 +1,457 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, + dsl_dataset_t **dsp, void *tag, char **shortnamep) +{ + char buf[ZFS_MAX_DATASET_NAME_LEN]; + char *hashp; + + if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + hashp = strchr(fullname, '#'); + if (hashp == NULL) + return (SET_ERROR(EINVAL)); + + *shortnamep = hashp + 1; + if (zfs_component_namecheck(*shortnamep, NULL, NULL)) + return (SET_ERROR(EINVAL)); + (void) strlcpy(buf, fullname, hashp - fullname + 1); + return (dsl_dataset_hold(dp, buf, tag, dsp)); +} + +/* + * Returns ESRCH if bookmark is not found. + */ +static int +dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, + zfs_bookmark_phys_t *bmark_phys) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t bmark_zapobj = ds->ds_bookmarks; + matchtype_t mt; + int err; + + if (bmark_zapobj == 0) + return (SET_ERROR(ESRCH)); + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), + sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, + NULL, 0, NULL); + + return (err == ENOENT ? ESRCH : err); +} + +/* + * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark + * does not represents an earlier point in later_ds's timeline. + * + * Returns ENOENT if the dataset containing the bookmark does not exist. + * Returns ESRCH if the dataset exists but the bookmark was not found in it. + */ +int +dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, + dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp) +{ + char *shortname; + dsl_dataset_t *ds; + int error; + + error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); + if (error != 0) + return (error); + + error = dsl_dataset_bmark_lookup(ds, shortname, bmp); + if (error == 0 && later_ds != NULL) { + if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) + error = SET_ERROR(EXDEV); + } + dsl_dataset_rele(ds, FTAG); + return (error); +} + +typedef struct dsl_bookmark_create_arg { + nvlist_t *dbca_bmarks; + nvlist_t *dbca_errors; +} dsl_bookmark_create_arg_t; + +static int +dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *bmark_fs; + char *shortname; + int error; + zfs_bookmark_phys_t bmark_phys; + + if (!snapds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + error = dsl_bookmark_hold_ds(dp, bookmark_name, + &bmark_fs, FTAG, &shortname); + if (error != 0) + return (error); + + if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) { + dsl_dataset_rele(bmark_fs, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_dataset_bmark_lookup(bmark_fs, shortname, + &bmark_phys); + dsl_dataset_rele(bmark_fs, FTAG); + if (error == 0) + return (SET_ERROR(EEXIST)); + if (error == ESRCH) + return (0); + return (error); +} + +static int +dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int rv = 0; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) + return (SET_ERROR(ENOTSUP)); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + dsl_dataset_t *snapds; + int error; + + /* 
note: validity of nvlist checked by ioctl layer */ + error = dsl_dataset_hold(dp, fnvpair_value_string(pair), + FTAG, &snapds); + if (error == 0) { + error = dsl_bookmark_create_check_impl(snapds, + nvpair_name(pair), tx); + dsl_dataset_rele(snapds, FTAG); + } + if (error != 0) { + fnvlist_add_int32(dbca->dbca_errors, + nvpair_name(pair), error); + rv = error; + } + } + + return (rv); +} + +static void +dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + dsl_dataset_t *snapds, *bmark_fs; + zfs_bookmark_phys_t bmark_phys; + char *shortname; + + VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), + FTAG, &snapds)); + VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), + &bmark_fs, FTAG, &shortname)); + if (bmark_fs->ds_bookmarks == 0) { + bmark_fs->ds_bookmarks = + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + + dsl_dataset_zapify(bmark_fs, tx); + VERIFY0(zap_add(mos, bmark_fs->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (bmark_fs->ds_bookmarks), 1, + &bmark_fs->ds_bookmarks, tx)); + } + + bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; + bmark_phys.zbm_creation_txg = + dsl_dataset_phys(snapds)->ds_creation_txg; + bmark_phys.zbm_creation_time = + dsl_dataset_phys(snapds)->ds_creation_time; + + VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, + shortname, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &bmark_phys, tx)); + + spa_history_log_internal_ds(bmark_fs, "bookmark", tx, + "name=%s creation_txg=%llu target_snap=%llu", + shortname, + (longlong_t)bmark_phys.zbm_creation_txg, + (longlong_t)snapds->ds_object); + + dsl_dataset_rele(bmark_fs, FTAG); + dsl_dataset_rele(snapds, FTAG); + } +} + +/* + * The bookmarks must all be in the same pool. 
+ */ +int +dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) +{ + nvpair_t *pair; + dsl_bookmark_create_arg_t dbca; + + pair = nvlist_next_nvpair(bmarks, NULL); + if (pair == NULL) + return (0); + + dbca.dbca_bmarks = bmarks; + dbca.dbca_errors = errors; + + return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check, + dsl_bookmark_create_sync, &dbca, + fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); +} + +int +dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) +{ + int err = 0; + zap_cursor_t zc; + zap_attribute_t attr; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + uint64_t bmark_zapobj = ds->ds_bookmarks; + if (bmark_zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + char *bmark_name = attr.za_name; + zfs_bookmark_phys_t bmark_phys; + + err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); + ASSERT3U(err, !=, ENOENT); + if (err != 0) + break; + + nvlist_t *out_props = fnvlist_alloc(); + if (nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_GUID, bmark_phys.zbm_guid); + } + if (nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); + } + if (nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATION))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); + } + + fnvlist_add_nvlist(outnvl, bmark_name, out_props); + fnvlist_free(out_props); + } + zap_cursor_fini(&zc); + return (err); +} + +/* + * Retrieve the bookmarks that exist in the specified dataset, and the + * requested properties of each bookmark. + * + * The "props" nvlist specifies which properties are requested. + * See lzc_get_bookmarks() for the list of valid properties. 
+ */ +int +dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + err = dsl_get_bookmarks_impl(ds, props, outnvl); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); +} + +typedef struct dsl_bookmark_destroy_arg { + nvlist_t *dbda_bmarks; + nvlist_t *dbda_success; + nvlist_t *dbda_errors; +} dsl_bookmark_destroy_arg_t; + +static int +dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t bmark_zapobj = ds->ds_bookmarks; + matchtype_t mt; + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_FIRST; + else + mt = MT_EXACT; + + return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); +} + +static int +dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_destroy_arg_t *dbda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int rv = 0; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) + return (0); + + for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) { + const char *fullname = nvpair_name(pair); + dsl_dataset_t *ds; + zfs_bookmark_phys_t bm; + int error; + char *shortname; + + error = dsl_bookmark_hold_ds(dp, fullname, &ds, + FTAG, &shortname); + if (error == ENOENT) { + /* ignore it; the bookmark is "already destroyed" */ + continue; + } + if (error == 0) { + error = dsl_dataset_bmark_lookup(ds, shortname, &bm); + dsl_dataset_rele(ds, FTAG); + if (error == ESRCH) { + /* + * ignore it; the bookmark is + * "already destroyed" + */ + continue; + } + } + if (error == 0) { + fnvlist_add_boolean(dbda->dbda_success, fullname); + } else { + fnvlist_add_int32(dbda->dbda_errors, fullname, error); + rv = error; + } + } + return (rv); +} + +static void +dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_destroy_arg_t *dbda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) { + dsl_dataset_t *ds; + char *shortname; + uint64_t zap_cnt; + + VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), + &ds, FTAG, &shortname)); + VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); + + /* + * If all of this dataset's bookmarks have been destroyed, + * free the zap object and decrement the feature's use count. + */ + VERIFY0(zap_count(mos, ds->ds_bookmarks, + &zap_cnt)); + if (zap_cnt == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); + ds->ds_bookmarks = 0; + spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + VERIFY0(zap_remove(mos, ds->ds_object, + DS_FIELD_BOOKMARK_NAMES, tx)); + } + + spa_history_log_internal_ds(ds, "remove bookmark", tx, + "name=%s", shortname); + + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The bookmarks must all be in the same pool. 
+ */ +int +dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) +{ + int rv; + dsl_bookmark_destroy_arg_t dbda; + nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); + if (pair == NULL) + return (0); + + dbda.dbda_bmarks = bmarks; + dbda.dbda_errors = errors; + dbda.dbda_success = fnvlist_alloc(); + + rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check, + dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks), + ZFS_SPACE_CHECK_RESERVED); + fnvlist_free(dbda.dbda_success); + return (rv); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c,v retrieving revision 1.2 diff -u -p -r1.2 dsl_dataset.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c 19 Feb 2016 19:25:22 -0000 1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c 29 Apr 2017 00:45:23 -0000 @@ -19,8 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright (c) 2011 Martin Matuska + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 RackTop Systems. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. */ #include @@ -29,29 +35,60 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include -static char *dsl_reaper = "the grim reaper"; +SYSCTL_DECL(_vfs_zfs); -static dsl_checkfunc_t dsl_dataset_destroy_begin_check; -static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; -static dsl_syncfunc_t dsl_dataset_set_reservation_sync; +/* + * The SPA supports block sizes up to 16MB. However, very large blocks + * can have an impact on i/o latency (e.g. tying up a spinning disk for + * ~300ms), and also potentially on the memory allocator. Therefore, + * we do not allow the recordsize to be set larger than zfs_max_recordsize + * (default 1MB). Larger blocks can be created by changing this tunable, + * and pools with larger blocks can always be imported and used, regardless + * of this setting. + */ +int zfs_max_recordsize = 1 * 1024 * 1024; +SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN, + &zfs_max_recordsize, 0, + "Maximum block size. 
Expect dragons when tuning this."); + +#define SWITCH64(x, y) \ + { \ + uint64_t __tmp = (x); \ + (x) = (y); \ + (y) = __tmp; \ + } #define DS_REF_MAX (1ULL << 62) -#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE +extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); -#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) +extern int spa_asize_inflation; +static zil_header_t zero_zil; /* * Figure out how much of this delta should be propogated to the dsl_dir @@ -61,13 +98,15 @@ static dsl_syncfunc_t dsl_dataset_set_re static int64_t parent_delta(dsl_dataset_t *ds, int64_t delta) { + dsl_dataset_phys_t *ds_phys; uint64_t old_bytes, new_bytes; if (ds->ds_reserved == 0) return (delta); - old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); - new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + ds_phys = dsl_dataset_phys(ds); + old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved); ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); return (new_bytes - old_bytes); @@ -81,129 +120,124 @@ dsl_dataset_block_born(dsl_dataset_t *ds int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; - dprintf_bp(bp, "born, ds=%p\n", ds); + dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); - ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); + ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { - /* - * Account for the meta-objset space in its placeholder - * dsl_dir. - */ - ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, - used, compressed, uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + dsl_pool_mos_diduse_space(tx->tx_pool, + used, compressed, uncompressed); return; } + + ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); dmu_buf_will_dirty(ds->ds_dbuf, tx); - mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); - ds->ds_phys->ds_used_bytes += used; - ds->ds_phys->ds_compressed_bytes += compressed; - ds->ds_phys->ds_uncompressed_bytes += uncompressed; - ds->ds_phys->ds_unique_bytes += used; + dsl_dataset_phys(ds)->ds_referenced_bytes += used; + dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; + dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; + dsl_dataset_phys(ds)->ds_unique_bytes += used; + + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { + ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] = + B_TRUE; + } + + spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) + ds->ds_feature_activation_needed[f] = B_TRUE; + mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, used - delta, - DD_USED_REFRSRV, DD_USED_HEAD, tx); - mutex_exit(&ds->ds_dir->dd_lock); + DD_USED_REFRSRV, DD_USED_HEAD, NULL); } int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + if (BP_IS_HOLE(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); - int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int 
uncompressed = BP_GET_UCSIZE(bp); - - ASSERT(used > 0); if (ds == NULL) { - /* - * Account for the meta-objset space in its placeholder - * dataset. - */ dsl_free(tx->tx_pool, tx->tx_txg, bp); - - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, - -used, -compressed, -uncompressed, tx); - dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); + dsl_pool_mos_diduse_space(tx->tx_pool, + -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); - if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { + if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; - dprintf_bp(bp, "freeing: %s", ""); + dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); - mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); - ASSERT(ds->ds_phys->ds_unique_bytes >= used || + ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || !DS_UNIQUE_IS_ACCURATE(ds)); delta = parent_delta(ds, -used); - ds->ds_phys->ds_unique_bytes -= used; + dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, -compressed, -uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, -used - delta, - DD_USED_REFRSRV, DD_USED_HEAD, tx); - mutex_exit(&ds->ds_dir->dd_lock); + DD_USED_REFRSRV, DD_USED_HEAD, NULL); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { /* * We are here as part of zio's write done callback, * which means we're a zio interrupt thread. We can't - * call bplist_enqueue() now because it may block + * call dsl_deadlist_insert() now because it may block * waiting for I/O. Instead, put bp on the deferred * queue and let dsl_pool_sync() finish the job. 
*/ - bplist_enqueue_deferred(&ds->ds_deadlist, bp); + bplist_append(&ds->ds_pending_deadlist, bp); } else { - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); + dsl_dataset_phys(ds)->ds_prev_snap_obj); + ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ - if (ds->ds_prev->ds_phys->ds_next_snap_obj == + if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object && bp->blk_birth > - ds->ds_prev->ds_phys->ds_prev_snap_txg) { + dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); - ds->ds_prev->ds_phys->ds_unique_bytes += used; + dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_origin_txg) { + if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } } mutex_enter(&ds->ds_lock); - ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); - ds->ds_phys->ds_used_bytes -= used; - ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); - ds->ds_phys->ds_compressed_bytes -= compressed; - ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); - ds->ds_phys->ds_uncompressed_bytes -= uncompressed; + ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); + dsl_dataset_phys(ds)->ds_referenced_bytes -= used; + ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed); + dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed; + ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed); + dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); return (used); @@ -229,50 +263,78 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t if (ds->ds_trysnap_txg > spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) trysnap = ds->ds_trysnap_txg; - return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); + return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap)); } boolean_t -dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) +dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, + uint64_t blk_birth) { - return (blk_birth > dsl_dataset_prev_snap_txg(ds)); + if (blk_birth <= dsl_dataset_prev_snap_txg(ds) || + (bp != NULL && BP_IS_HOLE(bp))) + return (B_FALSE); + + ddt_prefetch(dsl_dataset_get_spa(ds), bp); + + return (B_TRUE); } -/* ARGSUSED */ +/* + * We have to release the fsid syncronously or we risk that a subsequent + * mount of the same dataset will fail to unique_insert the fsid. This + * failure would manifest itself as the fsid of this dataset changing + * between mounts which makes NFS clients quite unhappy. 
+ */ static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) +dsl_dataset_evict_sync(void *dbu) { - dsl_dataset_t *ds = dsv; + dsl_dataset_t *ds = dbu; - ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); + ASSERT(ds->ds_owner == NULL); unique_remove(ds->ds_fsid_guid); +} + +static void +dsl_dataset_evict_async(void *dbu) +{ + dsl_dataset_t *ds = dbu; + + ASSERT(ds->ds_owner == NULL); + + ds->ds_dbuf = NULL; if (ds->ds_objset != NULL) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { - dsl_dataset_drop_ref(ds->ds_prev, ds); + dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = NULL; } - bplist_close(&ds->ds_deadlist); + bplist_destroy(&ds->ds_pending_deadlist); + if (ds->ds_deadlist.dl_os != NULL) + dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_dir) - dsl_dir_close(ds->ds_dir, ds); + dsl_dir_async_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); + list_destroy(&ds->ds_prop_cbs); + if (mutex_owned(&ds->ds_lock)) + mutex_exit(&ds->ds_lock); mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); + if (mutex_owned(&ds->ds_opening_lock)) + mutex_exit(&ds->ds_opening_lock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); - bplist_fini(&ds->ds_deadlist); + mutex_destroy(&ds->ds_sendstream_lock); + refcount_destroy(&ds->ds_longholds); + rrw_destroy(&ds->ds_bp_rwlock); kmem_free(ds, sizeof (dsl_dataset_t)); } -static int +int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; @@ -283,12 +345,12 @@ dsl_dataset_get_snapname(dsl_dataset_t * if (ds->ds_snapname[0]) return (0); - if (ds->ds_phys->ds_next_snap_obj == 0) + if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) return (0); - err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, + err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &headdbuf); - if (err) + if (err != 0) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, @@ -297,15 +359,15 @@ dsl_dataset_get_snapname(dsl_dataset_t * return (err); } -static int +int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt; int err; - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; @@ -317,17 +379,18 @@ dsl_dataset_snap_lookup(dsl_dataset_t *d return (err); } -static int -dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) +int +dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; + uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt; int err; dsl_dir_snap_cmtime_update(ds->ds_dir); - if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; @@ -335,316 +398,305 @@ dsl_dataset_snap_remove(dsl_dataset_t *d err = zap_remove_norm(mos, snapobj, name, mt, tx); if (err == ENOTSUP && mt == MT_FIRST) err = zap_remove(mos, snapobj, name, tx); + + if (err == 0 && adj_cnt) + dsl_fs_ss_count_adjust(ds->ds_dir, -1, + DD_FIELD_SNAPSHOT_COUNT, tx); + return (err); } -static int -dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, 
+boolean_t +dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) +{ + dmu_buf_t *dbuf = ds->ds_dbuf; + boolean_t result = B_FALSE; + + if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset, + ds->ds_object, DMU_BONUS_BLKID, tag)) { + + if (ds == dmu_buf_get_user(dbuf)) + result = B_TRUE; + else + dmu_buf_rele(dbuf, tag); + } + + return (result); +} + +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; + dmu_object_info_t doi; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); - if (err) + if (err != 0) return (err); + + /* Make sure dsobj has the correct object type. */ + dmu_object_info_from_db(dbuf, &doi); + if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) { + dmu_buf_rele(dbuf, tag); + return (SET_ERROR(EINVAL)); + } + ds = dmu_buf_get_user(dbuf); if (ds == NULL) { - dsl_dataset_t *winner; + dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; - ds->ds_phys = dbuf->db_data; + ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&ds->ds_rwlock, 0, 0, 0); - cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); - bplist_init(&ds->ds_deadlist); - - err = bplist_open(&ds->ds_deadlist, - mos, ds->ds_phys->ds_deadlist_obj); - if (err == 0) { - err = dsl_dir_open_obj(dp, - ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); + mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); + rrw_init(&ds->ds_bp_rwlock, B_FALSE); + refcount_create(&ds->ds_longholds); + + bplist_create(&ds->ds_pending_deadlist); + dsl_deadlist_open(&ds->ds_deadlist, + mos, dsl_dataset_phys(ds)->ds_deadlist_obj); + + list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), + offsetof(dmu_sendarg_t, dsa_link)); + + list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_ds_node)); + + if (doi.doi_type == DMU_OTN_ZAP_METADATA) { + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) + continue; + err = zap_contains(mos, dsobj, + spa_feature_table[f].fi_guid); + if (err == 0) { + ds->ds_feature_inuse[f] = B_TRUE; + } else { + ASSERT3U(err, ==, ENOENT); + err = 0; + } + } } - if (err) { - /* - * we don't really need to close the blist if we - * just opened it. 
- */ + + err = dsl_dir_hold_obj(dp, + dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir); + if (err != 0) { mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); - bplist_fini(&ds->ds_deadlist); + mutex_destroy(&ds->ds_sendstream_lock); + refcount_destroy(&ds->ds_longholds); + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; - if (ds->ds_phys->ds_prev_snap_obj) { - err = dsl_dataset_get_ref(dp, - ds->ds_phys->ds_prev_snap_obj, + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } - - if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { - dsl_dataset_t *origin; - - err = dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_origin_obj, - FTAG, &origin); - if (err == 0) { - ds->ds_origin_txg = - origin->ds_phys->ds_creation_txg; - dsl_dataset_rele(origin, FTAG); - } + if (doi.doi_type == DMU_OTN_ZAP_METADATA) { + int zaperr = zap_lookup(mos, ds->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (ds->ds_bookmarks), 1, + &ds->ds_bookmarks); + if (zaperr != ENOENT) + VERIFY0(zaperr); } } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); - if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { + if (err == 0 && + dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { err = zap_count( ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_userrefs_obj, + dsl_dataset_phys(ds)->ds_userrefs_obj, &ds->ds_userrefs); } } - if (err == 0 && !dsl_dataset_is_snapshot(ds)) { - /* - * In sync context, we're called with either no lock - * or with the write lock. If we're not syncing, - * we're always called with the read lock held. 
- */ - boolean_t need_lock = - !RW_WRITE_HELD(&dp->dp_config_rwlock) && - dsl_pool_sync_context(dp); - - if (need_lock) - rw_enter(&dp->dp_config_rwlock, RW_READER); - - err = dsl_prop_get_ds(ds, - "refreservation", sizeof (uint64_t), 1, - &ds->ds_reserved, NULL); + if (err == 0 && !ds->ds_is_snapshot) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + &ds->ds_reserved); if (err == 0) { - err = dsl_prop_get_ds(ds, - "refquota", sizeof (uint64_t), 1, - &ds->ds_quota, NULL); + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + &ds->ds_quota); } - - if (need_lock) - rw_exit(&dp->dp_config_rwlock); } else { ds->ds_reserved = ds->ds_quota = 0; } - if (err == 0) { - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - } - if (err || winner) { - bplist_close(&ds->ds_deadlist); + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, + dsl_dataset_evict_async, &ds->ds_dbuf); + if (err == 0) + winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); + + if (err != 0 || winner != NULL) { + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) - dsl_dataset_drop_ref(ds->ds_prev, ds); - dsl_dir_close(ds->ds_dir, ds); + dsl_dataset_rele(ds->ds_prev, ds); + dsl_dir_rele(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); - mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - rw_destroy(&ds->ds_rwlock); - cv_destroy(&ds->ds_exclusive_cv); - bplist_fini(&ds->ds_deadlist); + mutex_destroy(&ds->ds_sendstream_lock); + refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); - if (err) { + if (err != 0) { dmu_buf_rele(dbuf, tag); return (err); } ds = winner; } else { ds->ds_fsid_guid = - unique_insert(ds->ds_phys->ds_fsid_guid); + unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); + if (ds->ds_fsid_guid != + dsl_dataset_phys(ds)->ds_fsid_guid) { + zfs_dbgmsg("ds_fsid_guid changed from " + "%llx to %llx for pool %s dataset id %llu", + (long long) + dsl_dataset_phys(ds)->ds_fsid_guid, + (long long)ds->ds_fsid_guid, + spa_name(dp->dp_spa), + dsobj); + } } } ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(ds->ds_phys, ==, dbuf->db_data); - ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || + ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); - mutex_enter(&ds->ds_lock); - if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { - mutex_exit(&ds->ds_lock); - dmu_buf_rele(ds->ds_dbuf, tag); - return (ENOENT); - } - mutex_exit(&ds->ds_lock); *dsp = ds; return (0); } -static int -dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* - * In syncing context we don't want the rwlock lock: there - * may be an existing writer waiting for sync phase to - * finish. We don't need to worry about such writers, since - * sync phase is single-threaded, so the writer can't be - * doing anything while we are active. - */ - if (dsl_pool_sync_context(dp)) { - ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - return (0); - } - - /* - * Normal users will hold the ds_rwlock as a READER until they - * are finished (i.e., call dsl_dataset_rele()). "Owners" will - * drop their READER lock after they set the ds_owner field. 
- * - * If the dataset is being destroyed, the destroy thread will - * obtain a WRITER lock for exclusive access after it's done its - * open-context work and then change the ds_owner to - * dsl_reaper once destruction is assured. So threads - * may block here temporarily, until the "destructability" of - * the dataset is determined. - */ - ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); - mutex_enter(&ds->ds_lock); - while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { - rw_exit(&dp->dp_config_rwlock); - cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); - if (DSL_DATASET_IS_DESTROYED(ds)) { - mutex_exit(&ds->ds_lock); - dsl_dataset_drop_ref(ds, tag); - rw_enter(&dp->dp_config_rwlock, RW_READER); - return (ENOENT); - } - /* - * The dp_config_rwlock lives above the ds_lock. And - * we need to check DSL_DATASET_IS_DESTROYED() while - * holding the ds_lock, so we have to drop and reacquire - * the ds_lock here. - */ - mutex_exit(&ds->ds_lock); - rw_enter(&dp->dp_config_rwlock, RW_READER); - mutex_enter(&ds->ds_lock); - } - mutex_exit(&ds->ds_lock); - return (0); -} - -int -dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, - dsl_dataset_t **dsp) -{ - int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); - - if (err) - return (err); - return (dsl_dataset_hold_ref(*dsp, tag)); -} - int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, +dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); - if (err) - return (err); - if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { - dsl_dataset_rele(*dsp, tag); - *dsp = NULL; - return (EBUSY); - } - return (0); -} - -int -dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) -{ dsl_dir_t *dd; - dsl_pool_t *dp; const char *snapname; uint64_t obj; int err = 0; + dsl_dataset_t *ds; - err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); - if (err) + err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); + if (err != 0) return (err); - dp = dd->dd_pool; - obj = dd->dd_phys->dd_head_dataset_obj; - rw_enter(&dp->dp_config_rwlock, RW_READER); - if (obj) - err = dsl_dataset_get_ref(dp, obj, tag, dsp); + ASSERT(dsl_pool_config_held(dp)); + obj = dsl_dir_phys(dd)->dd_head_dataset_obj; + if (obj != 0) + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); else - err = ENOENT; - if (err) - goto out; - - err = dsl_dataset_hold_ref(*dsp, tag); + err = SET_ERROR(ENOENT); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { - dsl_dataset_t *ds = NULL; + dsl_dataset_t *snap_ds; if (*snapname++ != '@') { - dsl_dataset_rele(*dsp, tag); - err = ENOENT; - goto out; + dsl_dataset_rele(ds, tag); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(ENOENT)); } dprintf("looking for snapshot '%s'\n", snapname); - err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); + err = dsl_dataset_snap_lookup(ds, snapname, &obj); if (err == 0) - err = dsl_dataset_get_ref(dp, obj, tag, &ds); - dsl_dataset_rele(*dsp, tag); + err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); + dsl_dataset_rele(ds, tag); - ASSERT3U((err == 0), ==, (ds != NULL)); - - if (ds) { - mutex_enter(&ds->ds_lock); - if (ds->ds_snapname[0] == 0) - (void) strlcpy(ds->ds_snapname, snapname, - sizeof (ds->ds_snapname)); - mutex_exit(&ds->ds_lock); - err = dsl_dataset_hold_ref(ds, tag); - *dsp = err ? 
NULL : ds; + if (err == 0) { + mutex_enter(&snap_ds->ds_lock); + if (snap_ds->ds_snapname[0] == 0) + (void) strlcpy(snap_ds->ds_snapname, snapname, + sizeof (snap_ds->ds_snapname)); + mutex_exit(&snap_ds->ds_lock); + ds = snap_ds; } } -out: - rw_exit(&dp->dp_config_rwlock); - dsl_dir_close(dd, FTAG); + if (err == 0) + *dsp = ds; + dsl_dir_rele(dd, FTAG); return (err); } int -dsl_dataset_own(const char *name, boolean_t inconsistentok, +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); + if (err != 0) + return (err); + if (!dsl_dataset_tryown(*dsp, tag)) { + dsl_dataset_rele(*dsp, tag); + *dsp = NULL; + return (SET_ERROR(EBUSY)); + } + return (0); +} + +int +dsl_dataset_own(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold(name, tag, dsp); - if (err) + int err = dsl_dataset_hold(dp, name, tag, dsp); + if (err != 0) return (err); - if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + if (!dsl_dataset_tryown(*dsp, tag)) { dsl_dataset_rele(*dsp, tag); - return (EBUSY); + return (SET_ERROR(EBUSY)); } return (0); } +/* + * See the comment above dsl_pool_hold() for details. In summary, a long + * hold is used to prevent destruction of a dataset while the pool hold + * is dropped, allowing other concurrent operations (e.g. spa_sync()). + * + * The dataset and pool must be held when this function is called. After it + * is called, the pool hold may be released while the dataset is still held + * and accessed. + */ +void +dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + (void) refcount_add(&ds->ds_longholds, tag); +} + +void +dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) +{ + (void) refcount_remove(&ds->ds_longholds, tag); +} + +/* Return B_TRUE if there are any long holds on this dataset. */ +boolean_t +dsl_dataset_long_held(dsl_dataset_t *ds) +{ + return (!refcount_is_zero(&ds->ds_longholds)); +} + void dsl_dataset_name(dsl_dataset_t *ds, char *name) { @@ -652,106 +704,110 @@ dsl_dataset_name(dsl_dataset_t *ds, char (void) strcpy(name, "mos"); } else { dsl_dir_name(ds->ds_dir, name); - VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { - (void) strcat(name, "@"); + VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); /* * We use a "recursive" mutex so that we * can call dprintf_ds() with ds_lock held. 
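The long-hold interface above replaces the old ds_rwlock/ds_exclusive_cv machinery: a caller that needs the dataset to stay around while the pool config lock is dropped takes a refcount instead of holding a lock for the duration of the work. A minimal sketch of the intended calling sequence, where work_on_dataset() and do_long_running_work() are hypothetical stand-ins for a real consumer such as send or receive:

static int
work_on_dataset(const char *dsname, void *tag)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	error = dsl_pool_hold(dsname, tag, &dp);
	if (error != 0)
		return (error);
	error = dsl_dataset_hold(dp, dsname, tag, &ds);
	if (error != 0) {
		dsl_pool_rele(dp, tag);
		return (error);
	}

	/*
	 * The long hold keeps the dataset from being destroyed while the
	 * pool hold is dropped, so spa_sync() and other pool-wide
	 * operations are not blocked for the duration of the work.
	 */
	dsl_dataset_long_hold(ds, tag);
	dsl_pool_rele(dp, tag);

	error = do_long_running_work(ds);	/* hypothetical */

	dsl_dataset_long_rele(ds, tag);
	dsl_dataset_rele(ds, tag);
	return (error);
}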
*/ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); - (void) strcat(name, ds->ds_snapname); + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&ds->ds_lock); } else { - (void) strcat(name, ds->ds_snapname); + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); } } } } -static int +int dsl_dataset_namelen(dsl_dataset_t *ds) { - int result; - - if (ds == NULL) { - result = 3; /* "mos" */ - } else { - result = dsl_dir_namelen(ds->ds_dir); - VERIFY(0 == dsl_dataset_get_snapname(ds)); - if (ds->ds_snapname[0]) { - ++result; /* adding one for the @-sign */ - if (!MUTEX_HELD(&ds->ds_lock)) { - mutex_enter(&ds->ds_lock); - result += strlen(ds->ds_snapname); - mutex_exit(&ds->ds_lock); - } else { - result += strlen(ds->ds_snapname); - } - } - } - - return (result); -} - -void -dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) -{ - dmu_buf_rele(ds->ds_dbuf, tag); + VERIFY0(dsl_dataset_get_snapname(ds)); + mutex_enter(&ds->ds_lock); + int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname); + mutex_exit(&ds->ds_lock); + return (len); } void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { - if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { - rw_exit(&ds->ds_rwlock); - } - dsl_dataset_drop_ref(ds, tag); + dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { - ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || - (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); + ASSERT3P(ds->ds_owner, ==, tag); + ASSERT(ds->ds_dbuf != NULL); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; - if (RW_WRITE_HELD(&ds->ds_rwlock)) { - rw_exit(&ds->ds_rwlock); - cv_broadcast(&ds->ds_exclusive_cv); - } mutex_exit(&ds->ds_lock); - if (ds->ds_dbuf) - dsl_dataset_drop_ref(ds, tag); - else - dsl_dataset_evict(ds->ds_dbuf, ds); + dsl_dataset_long_rele(ds, tag); + dsl_dataset_rele(ds, tag); } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) +dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); - if (ds->ds_owner == NULL && - (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { + if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; - if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) - rw_exit(&ds->ds_rwlock); + dsl_dataset_long_hold(ds, tag); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } +boolean_t +dsl_dataset_has_owner(dsl_dataset_t *ds) +{ + boolean_t rv; + mutex_enter(&ds->ds_lock); + rv = (ds->ds_owner != NULL); + mutex_exit(&ds->ds_lock); + return (rv); +} + +static void +dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + uint64_t zero = 0; + + VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + spa_feature_incr(spa, f, tx); + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + + VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, + sizeof (zero), 1, &zero, tx)); +} + void -dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) +dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) { - ASSERT3P(owner, ==, ds->ds_owner); - if (!RW_WRITE_HELD(&ds->ds_rwlock)) - rw_enter(&ds->ds_rwlock, RW_WRITER); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + + 
VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); + spa_feature_decr(spa, f, tx); } uint64_t @@ -768,58 +824,93 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd origin = dp->dp_origin_snap; ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); - ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); + ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); + ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); + do { + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + } while (dsphys->ds_guid == 0); dsphys->ds_snapnames_zapobj = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; - dsphys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - if (origin) { + if (origin == NULL) { + dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); + } else { + dsl_dataset_t *ohds; /* head of the origin snapshot */ + dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = - origin->ds_phys->ds_creation_txg; - dsphys->ds_used_bytes = - origin->ds_phys->ds_used_bytes; + dsl_dataset_phys(origin)->ds_creation_txg; + dsphys->ds_referenced_bytes = + dsl_dataset_phys(origin)->ds_referenced_bytes; dsphys->ds_compressed_bytes = - origin->ds_phys->ds_compressed_bytes; + dsl_dataset_phys(origin)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = - origin->ds_phys->ds_uncompressed_bytes; - dsphys->ds_bp = origin->ds_phys->ds_bp; - dsphys->ds_flags |= origin->ds_phys->ds_flags; + dsl_dataset_phys(origin)->ds_uncompressed_bytes; + rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG); + dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; + rrw_exit(&origin->ds_bp_rwlock, FTAG); + + /* + * Inherit flags that describe the dataset's contents + * (INCONSISTENT) or properties (Case Insensitive). 
+ */ + dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & + (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (origin->ds_feature_inuse[f]) + dsl_dataset_activate_feature(dsobj, f, tx); + } dmu_buf_will_dirty(origin->ds_dbuf, tx); - origin->ds_phys->ds_num_children++; + dsl_dataset_phys(origin)->ds_num_children++; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, + FTAG, &ohds)); + dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, + dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); + dsl_dataset_rele(ohds, FTAG); if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { - if (origin->ds_phys->ds_next_clones_obj == 0) { - origin->ds_phys->ds_next_clones_obj = + if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { + dsl_dataset_phys(origin)->ds_next_clones_obj = zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY(0 == zap_add_int(mos, - origin->ds_phys->ds_next_clones_obj, + VERIFY0(zap_add_int(mos, + dsl_dataset_phys(origin)->ds_next_clones_obj, dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_origin_obj = origin->ds_object; + dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + dsl_dir_phys(origin->ds_dir)->dd_clones = + zap_create(mos, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(mos, + dsl_dir_phys(origin->ds_dir)->dd_clones, + dsobj, tx)); + } } if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) @@ -828,11 +919,33 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); - dd->dd_phys->dd_head_dataset_obj = dsobj; + dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; return (dsobj); } +static void +dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + zio_t *zio; + + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dsl_dataset_sync(ds, zio, tx); + VERIFY0(zio_wait(zio)); + + /* dsl_dataset_sync_done will drop this reference. */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + dsl_dataset_sync_done(ds, tx); + } +} + uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) @@ -841,343 +954,154 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, uint64_t dsobj, ddobj; dsl_dir_t *dd; + ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); - VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); + VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); + dsobj = dsl_dataset_create_sync_dd(dd, origin, + flags & ~DS_CREATE_FLAG_NODIRTY, tx); dsl_deleg_set_create_perms(dd, tx, cr); - dsl_dir_close(dd, FTAG); + /* + * Since we're creating a new node we know it's a leaf, so we can + * initialize the counts if the limit feature is active. 
+ */ + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + uint64_t cnt = 0; + objset_t *os = dd->dd_pool->dp_meta_objset; + + dsl_dir_zapify(dd, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (cnt), 1, &cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (cnt), 1, &cnt, tx)); + } + + dsl_dir_rele(dd, FTAG); + + /* + * If we are creating a clone, make sure we zero out any stale + * data from the origin snapshots zil header. + */ + if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_zero_zil(ds, tx); + dsl_dataset_rele(ds, FTAG); + } return (dsobj); } +#if defined(__FreeBSD__) || defined(__NetBSD__) +/* FreeBSD ioctl compat begin */ struct destroyarg { - dsl_sync_task_group_t *dstg; - char *snapname; - char *failed; - boolean_t defer; + nvlist_t *nvl; + const char *snapname; }; static int -dsl_snapshot_destroy_one(const char *name, void *arg) +dsl_check_snap_cb(const char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; - int err; char *dsname; dsname = kmem_asprintf("%s@%s", name, da->snapname); - err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); - strfree(dsname); - if (err == 0) { - struct dsl_ds_destroyarg *dsda; + fnvlist_add_boolean(da->nvl, dsname); + kmem_free(dsname, strlen(dsname) + 1); - dsl_dataset_make_exclusive(ds, da->dstg); - if (ds->ds_objset != NULL) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); - dsda->ds = ds; - dsda->defer = da->defer; - dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, dsda, da->dstg, 0); - } else if (err == ENOENT) { - err = 0; - } else { - (void) strcpy(da->failed, name); - } - return (err); + return (0); } -/* - * Destroy 'snapname' in all descendants of 'fsname'. 
- */ -#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) +dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, + nvlist_t *snaps) { + struct destroyarg *da; int err; - struct destroyarg da; - dsl_sync_task_t *dst; - spa_t *spa; - - err = spa_open(fsname, &spa, FTAG); - if (err) - return (err); - da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - da.snapname = snapname; - da.failed = fsname; - da.defer = defer; - - err = dmu_objset_find(fsname, - dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); - - if (err == 0) - err = dsl_sync_task_group_wait(da.dstg); - for (dst = list_head(&da.dstg->dstg_tasks); dst; - dst = list_next(&da.dstg->dstg_tasks, dst)) { - struct dsl_ds_destroyarg *dsda = dst->dst_arg1; - dsl_dataset_t *ds = dsda->ds; - - /* - * Return the file system name that triggered the error - */ - if (dst->dst_err) { - dsl_dataset_name(ds, fsname); - *strchr(fsname, '@') = '\0'; - } - ASSERT3P(dsda->rm_origin, ==, NULL); - dsl_dataset_disown(ds, da.dstg); - kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); - } + da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP); + da->nvl = snaps; + da->snapname = snapname; + err = dmu_objset_find(fsname, dsl_check_snap_cb, da, + DS_FIND_CHILDREN); + kmem_free(da, sizeof (struct destroyarg)); - dsl_sync_task_group_destroy(da.dstg); - spa_close(spa, FTAG); return (err); } - -static boolean_t -dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) -{ - boolean_t might_destroy = B_FALSE; - - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && - DS_IS_DEFER_DESTROY(ds)) - might_destroy = B_TRUE; - mutex_exit(&ds->ds_lock); - - return (might_destroy); -} +/* FreeBSD ioctl compat end */ +#endif /* defined(__FreeBSD__) || defined(__NetBSD__) */ /* - * If we're removing a clone, and these three conditions are true: - * 1) the clone's origin has no other children - * 2) the clone's origin has no user references - * 3) the clone's origin has been marked for deferred destruction - * Then, prepare to remove the origin as part of this sync task group. + * The unique space in the head dataset can be calculated by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. 
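As a worked example with made-up numbers: say the head currently references 10 GB, its most recent snapshot references 8 GB, and 3 GB of what that snapshot references has since been freed from the head (and therefore sits on the head's deadlist). The snapshot then still shares 8 - 3 = 5 GB with the head, and the calculation below reduces to:

	uint64_t referenced = 10ULL << 30;	/* head's ds_referenced_bytes */
	uint64_t mrs_used   = 8ULL << 30;	/* snapshot's ds_referenced_bytes */
	uint64_t dlused     = 3ULL << 30;	/* space on the head's deadlist */
	uint64_t unique     = referenced - (mrs_used - dlused);	/* 5 GB */

which is the value dsl_dataset_recalc_head_uniq() stores in ds_unique_bytes.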
*/ -static int -dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) +void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) { - dsl_dataset_t *ds = dsda->ds; - dsl_dataset_t *origin = ds->ds_prev; + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; - if (dsl_dataset_might_destroy_origin(origin)) { - char *name; - int namelen; - int error; - - namelen = dsl_dataset_namelen(origin) + 1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(origin, name); -#ifdef _KERNEL - error = zfs_unmount_snap(name, NULL); - if (error) { - kmem_free(name, namelen); - return (error); - } -#endif - error = dsl_dataset_own(name, B_TRUE, tag, &origin); - kmem_free(name, namelen); - if (error) - return (error); - dsda->rm_origin = origin; - dsl_dataset_make_exclusive(origin, tag); + ASSERT(!ds->ds_is_snapshot); - if (origin->ds_objset != NULL) { - dmu_objset_evict(origin->ds_objset); - origin->ds_objset = NULL; - } - } + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) + mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; + else + mrs_used = 0; - return (0); + dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); + + ASSERT3U(dlused, <=, mrs_used); + dsl_dataset_phys(ds)->ds_unique_bytes = + dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } -/* - * ds must be opened as OWNER. On return (whether successful or not), - * ds will be closed and caller can no longer dereference it. - */ -int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) +void +dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx) { + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; int err; - dsl_sync_task_group_t *dstg; - objset_t *os; - dsl_dir_t *dd; - uint64_t obj; - struct dsl_ds_destroyarg dsda = { 0 }; - dsl_dataset_t dummy_ds = { 0 }; - - dsda.ds = ds; - - if (dsl_dataset_is_snapshot(ds)) { - /* Destroying a snapshot is simpler */ - dsl_dataset_make_exclusive(ds, tag); - - if (ds->ds_objset != NULL) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - dsda.defer = defer; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - &dsda, tag, 0); - ASSERT3P(dsda.rm_origin, ==, NULL); - goto out; - } else if (defer) { - err = EINVAL; - goto out; - } - - dd = ds->ds_dir; - dummy_ds.ds_dir = dd; - dummy_ds.ds_object = ds->ds_object; - - /* - * Check for errors and mark this ds as inconsistent, in - * case we crash while freeing the objects. - */ - err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, - dsl_dataset_destroy_begin_sync, ds, NULL, 0); - if (err) - goto out; - - err = dmu_objset_from_ds(ds, &os); - if (err) - goto out; - - /* - * remove the objects in open context, so that we won't - * have too much to do in syncing context. - */ - for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, - ds->ds_phys->ds_prev_snap_txg)) { - /* - * Ignore errors, if there is not enough disk space - * we will deal with it in dsl_dataset_destroy_sync(). - */ - (void) dmu_free_object(os, obj); - } - - /* - * We need to sync out all in-flight IO before we try to evict - * (the dataset evict func is trying to clear the cached entries - * for this dataset in the ARC). 
- */ - txg_wait_synced(dd->dd_pool, 0); - - /* - * If we managed to free all the objects in open - * context, the user space accounting should be zero. - */ - if (ds->ds_phys->ds_bp.blk_fill == 0 && - dmu_objset_userused_enabled(os)) { - uint64_t count; - - ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || - count == 0); - ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || - count == 0); - } - - if (err != ESRCH) - goto out; - - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); - rw_exit(&dd->dd_pool->dp_config_rwlock); - - if (err) - goto out; - - if (ds->ds_objset) { - /* - * We need to sync out all in-flight IO before we try - * to evict (the dataset evict func is trying to clear - * the cached entries for this dataset in the ARC). - */ - txg_wait_synced(dd->dd_pool, 0); - } - - /* - * Blow away the dsl_dir + head dataset. - */ - dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } + ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); + err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + obj, tx); /* - * If we're removing a clone, we might also need to remove its - * origin. + * The err should not be ENOENT, but a bug in a previous version + * of the code could cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a missing entry. + * If we knew that the pool was created after + * SPA_VERSION_NEXT_CLONES, we could assert that it isn't + * ENOENT. However, at least we can check that we don't have + * too many entries in the next_clones_obj even after failing to + * remove this one. */ - do { - dsda.need_prep = B_FALSE; - if (dsl_dir_is_clone(dd)) { - err = dsl_dataset_origin_rm_prep(&dsda, tag); - if (err) { - dsl_dir_close(dd, FTAG); - goto out; - } - } - - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, &dsda, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - - /* - * We could be racing against 'zfs release' or 'zfs destroy -d' - * on the origin snap, in which case we can get EBUSY if we - * needed to destroy the origin snap but were not ready to - * do so. 
- */ - if (dsda.need_prep) { - ASSERT(err == EBUSY); - ASSERT(dsl_dir_is_clone(dd)); - ASSERT(dsda.rm_origin == NULL); - } - } while (dsda.need_prep); - - if (dsda.rm_origin != NULL) - dsl_dataset_disown(dsda.rm_origin, tag); - - /* if it is successful, dsl_dir_destroy_sync will close the dd */ - if (err) - dsl_dir_close(dd, FTAG); -out: - dsl_dataset_disown(ds, tag); - return (err); + if (err != ENOENT) + VERIFY0(err); + ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + &count)); + ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); } + blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { - return (&ds->ds_phys->ds_bp); -} - -void -dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -{ - ASSERT(dmu_tx_is_syncing(tx)); - /* If it's the meta-objset, set dp_meta_rootbp */ - if (ds == NULL) { - tx->tx_pool->dp_meta_rootbp = *bp; - } else { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_bp = *bp; - } + return (&dsl_dataset_phys(ds)->ds_bp); } spa_t * @@ -1196,708 +1120,290 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu ASSERT(ds->ds_objset != NULL); - if (ds->ds_phys->ds_next_snap_obj != 0) + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) panic("dirtying snapshot!"); - dp = ds->ds_dir->dd_pool; + /* Must not dirty a dataset in the same txg where it got snapshotted. */ + ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); - if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { + dp = ds->ds_dir->dd_pool; + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, ds); } } -/* - * The unique space in the head dataset can be calculated by subtracting - * the space used in the most recent snapshot, that is still being used - * in this file system, from the space currently in use. To figure out - * the space in the most recent snapshot still in use, we need to take - * the total space used in the snapshot and subtract out the space that - * has been freed up since the snapshot was taken. 
- */ -static void -dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) -{ - uint64_t mrs_used; - uint64_t dlused, dlcomp, dluncomp; - - ASSERT(ds->ds_object == ds->ds_dir->dd_phys->dd_head_dataset_obj); - - if (ds->ds_phys->ds_prev_snap_obj != 0) - mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; - else - mrs_used = 0; - - VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, - &dluncomp)); - - ASSERT3U(dlused, <=, mrs_used); - ds->ds_phys->ds_unique_bytes = - ds->ds_phys->ds_used_bytes - (mrs_used - dlused); - - if (!DS_UNIQUE_IS_ACCURATE(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; -} - -static uint64_t -dsl_dataset_unique(dsl_dataset_t *ds) +boolean_t +dsl_dataset_is_dirty(dsl_dataset_t *ds) { - if (!DS_UNIQUE_IS_ACCURATE(ds) && !dsl_dataset_is_snapshot(ds)) - dsl_dataset_recalc_head_uniq(ds); - - return (ds->ds_phys->ds_unique_bytes); + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, + ds, t)) + return (B_TRUE); + } + return (B_FALSE); } -struct killarg { - dsl_dataset_t *ds; - dmu_tx_t *tx; -}; - -/* ARGSUSED */ static int -kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) { - struct killarg *ka = arg; - dmu_tx_t *tx = ka->tx; + uint64_t asize; - if (bp == NULL) + if (!dmu_tx_is_syncing(tx)) return (0); - if (zb->zb_level == ZB_ZIL_LEVEL) { - ASSERT(zilog != NULL); - /* - * It's a block in the intent log. It has no - * accounting, so just free it. - */ - dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); - } else { - ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); - } - - return (0); -} - -/* ARGSUSED */ -static int -dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; - int err; - /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EBUSY); + ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); + asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (SET_ERROR(ENOSPC)); /* - * This is really a dsl_dir thing, but check it here so that - * we'll be less likely to leave this dataset inconsistent & - * nearly destroyed. + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. 
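As a worked example with made-up numbers for dsl_dataset_snapshot_reserve_space() above: with a refreservation of 10 GB and 4 GB currently unique in the head, the snapshot will take over those 4 GB of unique blocks; the portion of them that was being covered by the reservation, MIN(4 GB, 10 GB) = 4 GB, now has to be found outside the reservation, and if dsl_dir_space_available() cannot supply that much the snapshot fails with ENOSPC:

	uint64_t unique   = 4ULL << 30;		/* ds_unique_bytes at snapshot time */
	uint64_t reserved = 10ULL << 30;	/* refreservation (ds_reserved) */
	uint64_t asize    = MIN(unique, reserved);	/* charged outside the reservation */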
*/ - err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); - if (err) - return (err); - if (count != 0) - return (EEXIST); + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); return (0); } -/* ARGSUSED */ -static void -dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - /* Mark it as inconsistent on-disk, in case we crash */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_internal_log(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); -} +typedef struct dsl_dataset_snapshot_arg { + nvlist_t *ddsa_snaps; + nvlist_t *ddsa_props; + nvlist_t *ddsa_errors; + cred_t *ddsa_cr; +} dsl_dataset_snapshot_arg_t; -static int -dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, - dmu_tx_t *tx) +int +dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr) { - dsl_dataset_t *ds = dsda->ds; - dsl_dataset_t *ds_prev = ds->ds_prev; + int error; + uint64_t value; - if (dsl_dataset_might_destroy_origin(ds_prev)) { - struct dsl_ds_destroyarg ndsda = {0}; + ds->ds_trysnap_txg = tx->tx_txg; - /* - * If we're not prepared to remove the origin, don't remove - * the clone either. - */ - if (dsda->rm_origin == NULL) { - dsda->need_prep = B_TRUE; - return (EBUSY); - } + if (!dmu_tx_is_syncing(tx)) + return (0); - ndsda.ds = ds_prev; - ndsda.is_origin_rm = B_TRUE; - return (dsl_dataset_destroy_check(&ndsda, tag, tx)); - } + /* + * We don't allow multiple snapshots of the same txg. If there + * is already one, try again. + */ + if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) + return (SET_ERROR(EAGAIN)); /* - * If we're not going to remove the origin after all, - * undo the open context setup. + * Check for conflicting snapshot name. */ - if (dsda->rm_origin != NULL) { - dsl_dataset_disown(dsda->rm_origin, tag); - dsda->rm_origin = NULL; + error = dsl_dataset_snap_lookup(ds, snapname, &value); + if (error == 0) + return (SET_ERROR(EEXIST)); + if (error != ENOENT) + return (error); + + /* + * We don't allow taking snapshots of inconsistent datasets, such as + * those into which we are currently receiving. However, if we are + * creating this snapshot as part of a receive, this check will be + * executed atomically with respect to the completion of the receive + * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this + * case we ignore this, knowing it will be fixed up for us shortly in + * dmu_recv_end_sync(). + */ + if (!recv && DS_IS_INCONSISTENT(ds)) + return (SET_ERROR(EBUSY)); + + /* + * Skip the check for temporary snapshots or if we have already checked + * the counts in dsl_dataset_snapshot_check. This means we really only + * check the count here when we're receiving a stream. 
+ */ + if (cnt != 0 && cr != NULL) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr); + if (error != 0) + return (error); } + error = dsl_dataset_snapshot_reserve_space(ds, tx); + if (error != 0) + return (error); + return (0); } -/* ARGSUSED */ -int -dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) +static int +dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) { - struct dsl_ds_destroyarg *dsda = arg1; - dsl_dataset_t *ds = dsda->ds; - - /* we have an owner hold, so noone else can destroy us */ - ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - - /* - * Only allow deferred destroy on pools that support it. - * NOTE: deferred destroy is only supported on snapshots. - */ - if (dsda->defer) { - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_USERREFS) - return (ENOTSUP); - ASSERT(dsl_dataset_is_snapshot(ds)); - return (0); - } + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; /* - * Can't delete a head dataset if there are snapshots of it. - * (Except if the only snapshots are from the branch we cloned - * from.) + * Pre-compute how many total new snapshots will be created for each + * level in the tree and below. This is needed for validating the + * snapshot limit when either taking a recursive snapshot or when + * taking multiple snapshots. + * + * The problem is that the counts are not actually adjusted when + * we are checking, only when we finally sync. For a single snapshot, + * this is easy, the count will increase by 1 at each node up the tree, + * but its more complicated for the recursive/multiple snapshot case. + * + * The dsl_fs_ss_limit_check function does recursively check the count + * at each level up the tree but since it is validating each snapshot + * independently we need to be sure that we are validating the complete + * count for the entire set of snapshots. We do this by rolling up the + * counts for each component of the name into an nvlist and then + * checking each of those cases with the aggregated count. + * + * This approach properly handles not only the recursive snapshot + * case (where we get all of those on the ddsa_snaps list) but also + * the sibling case (e.g. snapshot a/b and a/c so that we will also + * validate the limit on 'a' using a count of 2). + * + * We validate the snapshot names in the third loop and only report + * name errors once. */ - if (ds->ds_prev != NULL && - ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EBUSY); + if (dmu_tx_is_syncing(tx)) { + nvlist_t *cnt_track = NULL; + cnt_track = fnvlist_alloc(); + + /* Rollup aggregated counts into the cnt_track list */ + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + char *pdelim; + uint64_t val; + char nm[MAXPATHLEN]; - /* - * If we made changes this txg, traverse_dsl_dataset won't find - * them. Try again. - */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); + (void) strlcpy(nm, nvpair_name(pair), sizeof (nm)); + pdelim = strchr(nm, '@'); + if (pdelim == NULL) + continue; + *pdelim = '\0'; + + do { + if (nvlist_lookup_uint64(cnt_track, nm, + &val) == 0) { + /* update existing entry */ + fnvlist_add_uint64(cnt_track, nm, + val + 1); + } else { + /* add to list */ + fnvlist_add_uint64(cnt_track, nm, 1); + } - if (dsl_dataset_is_snapshot(ds)) { - /* - * If this snapshot has an elevated user reference count, - * we can't destroy it yet. 
- */ - if (ds->ds_userrefs > 0 && !dsda->releasing) - return (EBUSY); + pdelim = strrchr(nm, '/'); + if (pdelim != NULL) + *pdelim = '\0'; + } while (pdelim != NULL); + } + + /* Check aggregated counts at each level */ + for (pair = nvlist_next_nvpair(cnt_track, NULL); + pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { + int error = 0; + char *name; + uint64_t cnt = 0; + dsl_dataset_t *ds; + + name = nvpair_name(pair); + cnt = fnvpair_value_uint64(pair); + ASSERT(cnt > 0); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, + ddsa->ddsa_cr); + dsl_dataset_rele(ds, FTAG); + } - mutex_enter(&ds->ds_lock); - /* - * Can't delete a branch point. However, if we're destroying - * a clone and removing its origin due to it having a user - * hold count of 0 and having been marked for deferred destroy, - * it's OK for the origin to have a single clone. - */ - if (ds->ds_phys->ds_num_children > - (dsda->is_origin_rm ? 2 : 1)) { - mutex_exit(&ds->ds_lock); - return (EEXIST); + if (error != 0) { + if (ddsa->ddsa_errors != NULL) + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + rv = error; + /* only report one error for this check */ + break; + } } - mutex_exit(&ds->ds_lock); - } else if (dsl_dir_is_clone(ds->ds_dir)) { - return (dsl_dataset_origin_check(dsda, arg2, tx)); + nvlist_free(cnt_track); } - /* XXX we should do some i/o error checking... */ - return (0); -} - -struct refsarg { - kmutex_t lock; - boolean_t gone; - kcondvar_t cv; -}; + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + int error = 0; + dsl_dataset_t *ds; + char *name, *atp; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; -/* ARGSUSED */ -static void -dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) -{ - struct refsarg *arg = argv; - - mutex_enter(&arg->lock); - arg->gone = TRUE; - cv_signal(&arg->cv); - mutex_exit(&arg->lock); -} - -static void -dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) -{ - struct refsarg arg; - - mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); - arg.gone = FALSE; - (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, - dsl_dataset_refs_gone); - dmu_buf_rele(ds->ds_dbuf, tag); - mutex_enter(&arg.lock); - while (!arg.gone) - cv_wait(&arg.cv, &arg.lock); - ASSERT(arg.gone); - mutex_exit(&arg.lock); - ds->ds_dbuf = NULL; - ds->ds_phys = NULL; - mutex_destroy(&arg.lock); - cv_destroy(&arg.cv); -} - -static void -remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) -{ - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t count; - int err; - - ASSERT(ds->ds_phys->ds_num_children >= 2); - err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); - /* - * The err should not be ENOENT, but a bug in a previous version - * of the code could cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a missing entry. - * If we knew that the pool was created after - * SPA_VERSION_NEXT_CLONES, we could assert that it isn't - * ENOENT. However, at least we can check that we don't have - * too many entries in the next_clones_obj even after failing to - * remove this one. 
- */ - if (err != ENOENT) { - VERIFY3U(err, ==, 0); - } - ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, - &count)); - ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); -} - -void -dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) -{ - struct dsl_ds_destroyarg *dsda = arg1; - dsl_dataset_t *ds = dsda->ds; - int err; - int after_branch_point = FALSE; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - dsl_dataset_t *ds_prev = NULL; - uint64_t obj; - - ASSERT(ds->ds_owner); - ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); - ASSERT(ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); - ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); - - if (dsda->defer) { - ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); - if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; - return; - } - } - - /* signal any waiters that this dataset is going away */ - mutex_enter(&ds->ds_lock); - ds->ds_owner = dsl_reaper; - cv_broadcast(&ds->ds_exclusive_cv); - mutex_exit(&ds->ds_lock); - - /* Remove our reservation */ - if (ds->ds_reserved != 0) { - dsl_prop_setarg_t psa; - uint64_t value = 0; - - dsl_prop_setarg_init_uint64(&psa, "refreservation", - (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), - &value); - psa.psa_effective_value = 0; /* predict default value */ - - dsl_dataset_set_reservation_sync(ds, &psa, cr, tx); - ASSERT3U(ds->ds_reserved, ==, 0); - } - - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - - dsl_pool_ds_destroyed(ds, tx); - - obj = ds->ds_object; - - if (ds->ds_phys->ds_prev_snap_obj != 0) { - if (ds->ds_prev) { - ds_prev = ds->ds_prev; - } else { - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); - } - after_branch_point = - (ds_prev->ds_phys->ds_next_snap_obj != obj); - - dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); - if (after_branch_point && - ds_prev->ds_phys->ds_next_clones_obj != 0) { - remove_from_next_clones(ds_prev, obj, tx); - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(0 == zap_add_int(mos, - ds_prev->ds_phys->ds_next_clones_obj, - ds->ds_phys->ds_next_snap_obj, tx)); - } + name = nvpair_name(pair); + if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN) + error = SET_ERROR(ENAMETOOLONG); + if (error == 0) { + atp = strchr(name, '@'); + if (atp == NULL) + error = SET_ERROR(EINVAL); + if (error == 0) + (void) strlcpy(dsname, name, atp - name + 1); } - if (after_branch_point && - ds->ds_phys->ds_next_snap_obj == 0) { - /* This clone is toast. */ - ASSERT(ds_prev->ds_phys->ds_num_children > 1); - ds_prev->ds_phys->ds_num_children--; - - /* - * If the clone's origin has no other clones, no - * user holds, and has been marked for deferred - * deletion, then we should have done the necessary - * destroy setup for it. 
- */ - if (ds_prev->ds_phys->ds_num_children == 1 && - ds_prev->ds_userrefs == 0 && - DS_IS_DEFER_DESTROY(ds_prev)) { - ASSERT3P(dsda->rm_origin, !=, NULL); - } else { - ASSERT3P(dsda->rm_origin, ==, NULL); - } - } else if (!after_branch_point) { - ds_prev->ds_phys->ds_next_snap_obj = - ds->ds_phys->ds_next_snap_obj; - } - } - - if (ds->ds_phys->ds_next_snap_obj != 0) { - blkptr_t bp; - dsl_dataset_t *ds_next; - uint64_t itor = 0; - uint64_t old_unique; - int64_t used = 0, compressed = 0, uncompressed = 0; - - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); - ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); - - old_unique = dsl_dataset_unique(ds_next); - - dmu_buf_will_dirty(ds_next->ds_dbuf, tx); - ds_next->ds_phys->ds_prev_snap_obj = - ds->ds_phys->ds_prev_snap_obj; - ds_next->ds_phys->ds_prev_snap_txg = - ds->ds_phys->ds_prev_snap_txg; - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); - - /* - * Transfer to our deadlist (which will become next's - * new deadlist) any entries from next's current - * deadlist which were born before prev, and free the - * other entries. - * - * XXX we're doing this long task with the config lock held - */ - while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { - if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, - &bp, tx)); - if (ds_prev && !after_branch_point && - bp.blk_birth > - ds_prev->ds_phys->ds_prev_snap_txg) { - ds_prev->ds_phys->ds_unique_bytes += - bp_get_dsize_sync(dp->dp_spa, &bp); - } - } else { - used += bp_get_dsize_sync(dp->dp_spa, &bp); - compressed += BP_GET_PSIZE(&bp); - uncompressed += BP_GET_UCSIZE(&bp); - dsl_free(dp, tx->tx_txg, &bp); - } + if (error == 0) + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + /* passing 0/NULL skips dsl_fs_ss_limit_check */ + error = dsl_dataset_snapshot_check_impl(ds, + atp + 1, tx, B_FALSE, 0, NULL); + dsl_dataset_rele(ds, FTAG); } - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); - - /* change snapused */ - dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, - -used, -compressed, -uncompressed, tx); - - /* free next's deadlist */ - bplist_close(&ds_next->ds_deadlist); - bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); - - /* set next's deadlist to our deadlist */ - bplist_close(&ds->ds_deadlist); - ds_next->ds_phys->ds_deadlist_obj = - ds->ds_phys->ds_deadlist_obj; - VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj)); - ds->ds_phys->ds_deadlist_obj = 0; - - if (ds_next->ds_phys->ds_next_snap_obj != 0) { - /* - * Update next's unique to include blocks which - * were previously shared by only this snapshot - * and it. Those blocks will be born after the - * prev snap and before this snap, and will have - * died after the next snap and before the one - * after that (ie. be on the snap after next's - * deadlist). 
- * - * XXX we're doing this long task with the - * config lock held - */ - dsl_dataset_t *ds_after_next; - uint64_t space; - - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds_next->ds_phys->ds_next_snap_obj, - FTAG, &ds_after_next)); - - VERIFY(0 == - bplist_space_birthrange(&ds_after_next->ds_deadlist, - ds->ds_phys->ds_prev_snap_txg, - ds->ds_phys->ds_creation_txg, &space)); - ds_next->ds_phys->ds_unique_bytes += space; - - dsl_dataset_rele(ds_after_next, FTAG); - ASSERT3P(ds_next->ds_prev, ==, NULL); - } else { - ASSERT3P(ds_next->ds_prev, ==, ds); - dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); - ds_next->ds_prev = NULL; - if (ds_prev) { - VERIFY(0 == dsl_dataset_get_ref(dp, - ds->ds_phys->ds_prev_snap_obj, - ds_next, &ds_next->ds_prev)); - } - - dsl_dataset_recalc_head_uniq(ds_next); - - /* - * Reduce the amount of our unconsmed refreservation - * being charged to our parent by the amount of - * new unique data we have gained. - */ - if (old_unique < ds_next->ds_reserved) { - int64_t mrsdelta; - uint64_t new_unique = - ds_next->ds_phys->ds_unique_bytes; - - ASSERT(old_unique <= new_unique); - mrsdelta = MIN(new_unique - old_unique, - ds_next->ds_reserved - old_unique); - dsl_dir_diduse_space(ds->ds_dir, - DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + if (error != 0) { + if (ddsa->ddsa_errors != NULL) { + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); } + rv = error; } - dsl_dataset_rele(ds_next, FTAG); - } else { - /* - * There's no next snapshot, so this is a head dataset. - * Destroy the deadlist. Unless it's a clone, the - * deadlist should be empty. (If it's a clone, it's - * safe to ignore the deadlist contents.) - */ - struct killarg ka; - - ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist)); - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = 0; - - /* - * Free everything that we point to (that's born after - * the previous snapshot, if we are a clone) - * - * NB: this should be very quick, because we already - * freed all the objects in open context. 
- */ - ka.ds = ds; - ka.tx = tx; - err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - ASSERT3U(err, ==, 0); - ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || - ds->ds_phys->ds_unique_bytes == 0); - - if (ds->ds_prev != NULL) { - dsl_dataset_rele(ds->ds_prev, ds); - ds->ds_prev = ds_prev = NULL; - } - } - - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { - /* Erase the link in the dir */ - dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; - ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); - err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); - ASSERT(err == 0); - } else { - /* remove from snapshot namespace */ - dsl_dataset_t *ds_head; - ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); - VERIFY(0 == dsl_dataset_get_snapname(ds)); -#ifdef ZFS_DEBUG - { - uint64_t val; - - err = dsl_dataset_snap_lookup(ds_head, - ds->ds_snapname, &val); - ASSERT3U(err, ==, 0); - ASSERT3U(val, ==, obj); - } -#endif - err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); - ASSERT(err == 0); - dsl_dataset_rele(ds_head, FTAG); - } - - if (ds_prev && ds->ds_prev != ds_prev) - dsl_dataset_rele(ds_prev, FTAG); - - spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); - spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); - - if (ds->ds_phys->ds_next_clones_obj != 0) { - uint64_t count; - ASSERT(0 == zap_count(mos, - ds->ds_phys->ds_next_clones_obj, &count) && count == 0); - VERIFY(0 == dmu_object_free(mos, - ds->ds_phys->ds_next_clones_obj, tx)); - } - if (ds->ds_phys->ds_props_obj != 0) - VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); - if (ds->ds_phys->ds_userrefs_obj != 0) - VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); - dsl_dir_close(ds->ds_dir, ds); - ds->ds_dir = NULL; - dsl_dataset_drain_refs(ds, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); - - if (dsda->rm_origin) { - /* - * Remove the origin of the clone we just destroyed. - */ - struct dsl_ds_destroyarg ndsda = {0}; - - ndsda.ds = dsda->rm_origin; - dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); } -} - -static int -dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - uint64_t asize; - - if (!dmu_tx_is_syncing(tx)) - return (0); - - /* - * If there's an fs-only reservation, any blocks that might become - * owned by the snapshot dataset must be accommodated by space - * outside of the reservation. - */ - asize = MIN(dsl_dataset_unique(ds), ds->ds_reserved); - if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, FALSE)) - return (ENOSPC); - - /* - * Propogate any reserved space for this snapshot to other - * snapshot checks in this sync group. - */ - if (asize > 0) - dsl_dir_willuse_space(ds->ds_dir, asize, tx); - - return (0); -} - -/* ARGSUSED */ -int -dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - const char *snapname = arg2; - int err; - uint64_t value; - - /* - * We don't allow multiple snapshots of the same txg. If there - * is already one, try again. - */ - if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) - return (EAGAIN); - - /* - * Check for conflicting name snapshot name. - */ - err = dsl_dataset_snap_lookup(ds, snapname, &value); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); - - /* - * Check that the dataset's name is not too long. 
Name consists - * of the dataset's length + 1 for the @-sign + snapshot name's length - */ - if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) - return (ENAMETOOLONG); - - err = dsl_dataset_snapshot_reserve_space(ds, tx); - if (err) - return (err); - ds->ds_trysnap_txg = tx->tx_txg; - return (0); + return (rv); } void -dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - const char *snapname = arg2; dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; - int err; + objset_t *os; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* + * If we are on an old pool, the zil must not be active, in which + * case it will be zeroed. Usually zil_suspend() accomplishes this. + */ + ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || + dmu_objset_from_ds(ds, &os) != 0 || + bcmp(&os->os_phys->os_zil_header, &zero_zil, + sizeof (zero_zil)) == 0); + + /* Should not snapshot a dirty dataset. */ + ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, + ds, tx->tx_txg)); - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); + dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* * The origin's ds_creation_txg has to be < TXG_INITIAL @@ -1909,44 +1415,56 @@ dsl_dataset_snapshot_sync(void *arg1, vo dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); - (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, - sizeof (dsphys->ds_guid)); - dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; - dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + do { + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + } while (dsphys->ds_guid == 0); + dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; - dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; - dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; - dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; - dsphys->ds_flags = ds->ds_phys->ds_flags; - dsphys->ds_bp = ds->ds_phys->ds_bp; + dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; + dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; + dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + dsl_dataset_phys(ds)->ds_uncompressed_bytes; + dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; + rrw_exit(&ds->ds_bp_rwlock, FTAG); dmu_buf_rele(dbuf, FTAG); - ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) + dsl_dataset_activate_feature(dsobj, f, tx); + 
} + + ASSERT3U(ds->ds_prev != 0, ==, + dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (ds->ds_prev) { uint64_t next_clones_obj = - ds->ds_prev->ds_phys->ds_next_clones_obj; - ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == + dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; + ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object || - ds->ds_prev->ds_phys->ds_num_children > 1); - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { + dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); + if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == + ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, - ds->ds_prev->ds_phys->ds_creation_txg); - ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { - remove_from_next_clones(ds->ds_prev, + dsl_dataset_remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); - VERIFY3U(0, ==, zap_add_int(mos, + VERIFY0(zap_add_int(mos, next_clones_obj, dsobj, tx)); } } @@ -1957,148 +1475,584 @@ dsl_dataset_snapshot_sync(void *arg1, vo * since our unique space is going to zero. */ if (ds->ds_reserved) { - int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); + int64_t delta; + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, + ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, - add, 0, 0, tx); + delta, 0, 0, tx); } - bplist_close(&ds->ds_deadlist); dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); - ds->ds_phys->ds_prev_snap_obj = dsobj; - ds->ds_phys->ds_prev_snap_txg = crtxg; - ds->ds_phys->ds_unique_bytes = 0; + dsl_dataset_phys(ds)->ds_deadlist_obj = + dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj); + dsl_deadlist_add_key(&ds->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); + dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; + dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; + dsl_dataset_phys(ds)->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - - dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); - err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, - snapname, 8, 1, &dsobj, tx); - ASSERT(err == 0); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, + snapname, 8, 1, &dsobj, tx)); if (ds->ds_prev) - dsl_dataset_drop_ref(ds->ds_prev, ds); - VERIFY(0 == dsl_dataset_get_ref(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + dsl_dataset_rele(ds->ds_prev, ds); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); - dsl_pool_ds_snapshotted(ds, tx); + dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir); - spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, - "dataset = %llu", dsobj); + 
spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); } -void -dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +static void +dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) { - ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_objset != NULL); - ASSERT(ds->ds_phys->ds_next_snap_obj == 0); + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; - /* - * in case we had to change ds_fsid_guid when we opened it, - * sync it out now. - */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + dsl_dataset_t *ds; + char *name, *atp; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dir_dirty(ds->ds_dir, tx); - dmu_objset_sync(ds->ds_objset, zio, tx); + name = nvpair_name(pair); + atp = strchr(name, '@'); + (void) strlcpy(dsname, name, atp - name + 1); + VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); + if (ddsa->ddsa_props != NULL) { + dsl_props_set_sync_impl(ds->ds_prev, + ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); + } + dsl_dataset_rele(ds, FTAG); + } } -void -dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) +/* + * The snapshots must all be in the same pool. + * All-or-nothing: if there are any failures, nothing will be modified. + */ +int +dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) { - uint64_t refd, avail, uobjs, aobjs; + dsl_dataset_snapshot_arg_t ddsa; + nvpair_t *pair; + boolean_t needsuspend; + int error; + spa_t *spa; + char *firstname; + nvlist_t *suspended = NULL; - dsl_dir_stats(ds->ds_dir, nv); + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + firstname = nvpair_name(pair); - dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); + error = spa_open(firstname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, - ds->ds_phys->ds_creation_time); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, - ds->ds_phys->ds_creation_txg); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, - ds->ds_quota); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, - ds->ds_reserved); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, - ds->ds_phys->ds_guid); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, - dsl_dataset_unique(ds)); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, - ds->ds_object); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, - ds->ds_userrefs); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, - DS_IS_DEFER_DESTROY(ds) ? 1 : 0); + if (needsuspend) { + suspended = fnvlist_alloc(); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *snapname = nvpair_name(pair); + char *atp; + void *cookie; + + atp = strchr(snapname, '@'); + if (atp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + (void) strlcpy(fsname, snapname, atp - snapname + 1); - if (ds->ds_phys->ds_next_snap_obj) { - /* - * This is a snapshot; override the dd's space used with - * our unique space and compression ratio. 
- */ - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, - ds->ds_phys->ds_unique_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - ds->ds_phys->ds_compressed_bytes == 0 ? 100 : - (ds->ds_phys->ds_uncompressed_bytes * 100 / - ds->ds_phys->ds_compressed_bytes)); + error = zil_suspend(fsname, &cookie); + if (error != 0) + break; + fnvlist_add_uint64(suspended, fsname, + (uintptr_t)cookie); + } + } + + ddsa.ddsa_snaps = snaps; + ddsa.ddsa_props = props; + ddsa.ddsa_errors = errors; + ddsa.ddsa_cr = CRED(); + + if (error == 0) { + error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, &ddsa, + fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); + } + + if (suspended != NULL) { + for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; + pair = nvlist_next_nvpair(suspended, pair)) { + zil_resume((void *)(uintptr_t) + fnvpair_value_uint64(pair)); + } + fnvlist_free(suspended); + } + +#ifdef __FreeBSD__ +#ifdef _KERNEL + if (error == 0) { + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char *snapname = nvpair_name(pair); + zvol_create_minors(snapname); + } } +#endif +#endif + return (error); } -void -dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) +typedef struct dsl_dataset_snapshot_tmp_arg { + const char *ddsta_fsname; + const char *ddsta_snapname; + minor_t ddsta_cleanup_minor; + const char *ddsta_htag; +} dsl_dataset_snapshot_tmp_arg_t; + +static int +dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) { - stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; - stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; - stat->dds_guid = ds->ds_phys->ds_guid; - if (ds->ds_phys->ds_next_snap_obj) { - stat->dds_is_snapshot = B_TRUE; - stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; - } else { - stat->dds_is_snapshot = B_FALSE; - stat->dds_num_clones = 0; + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); + if (error != 0) + return (error); + + /* NULL cred means no limit check for tmp snapshot */ + error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, + tx, B_FALSE, 0, NULL); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); } - /* clone origin is really a dsl_dir thing... 
*/ - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - if (dsl_dir_is_clone(ds->ds_dir)) { - dsl_dataset_t *ods; - - VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, - ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); - dsl_dataset_name(ods, stat->dds_origin); - dsl_dataset_drop_ref(ods, FTAG); - } else { - stat->dds_origin[0] = '\0'; + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, + B_TRUE, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); } - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + + dsl_dataset_rele(ds, FTAG); + return (0); } -uint64_t -dsl_dataset_fsid_guid(dsl_dataset_t *ds) +static void +dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) { - return (ds->ds_fsid_guid); -} + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; -void -dsl_dataset_space(dsl_dataset_t *ds, + VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); + dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, + ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); + dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); + + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, const char *htag) +{ + dsl_dataset_snapshot_tmp_arg_t ddsta; + int error; + spa_t *spa; + boolean_t needsuspend; + void *cookie; + + ddsta.ddsta_fsname = fsname; + ddsta.ddsta_snapname = snapname; + ddsta.ddsta_cleanup_minor = cleanup_minor; + ddsta.ddsta_htag = htag; + + error = spa_open(fsname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + error = zil_suspend(fsname, &cookie); + if (error != 0) + return (error); + } + + error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, + dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); + + if (needsuspend) + zil_resume(cookie); + return (error); +} + + +void +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(ds->ds_objset != NULL); + ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); + + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; + + if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, + &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, + &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, + &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); + ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; + } + + dmu_objset_sync(ds->ds_objset, zio, tx); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_activation_needed[f]) { + if (ds->ds_feature_inuse[f]) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, tx); + ds->ds_feature_inuse[f] = B_TRUE; + } + } +} + +static int +deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + +void +dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *os = ds->ds_objset; + + bplist_iterate(&ds->ds_pending_deadlist, + deadlist_enqueue_cb, &ds->ds_deadlist, tx); + + ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); + + dmu_buf_rele(ds->ds_dbuf, ds); +} + +static void +get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t count = 0; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + nvlist_t *propval = fnvlist_alloc(); + nvlist_t *val; + + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + + /* + * We use nvlist_alloc() instead of fnvlist_alloc() because the + * latter would allocate the list with NV_UNIQUE_NAME flag. + * As a result, every time a clone name is appended to the list + * it would be (linearly) searched for for a duplicate name. + * We already know that all clone names must be unique and we + * want avoid the quadratic complexity of double-checking that + * because we can have a large number of clones. + */ + VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP)); + + /* + * There may be missing entries in ds_next_clones_obj + * due to a bug in a previous version of the code. + * Only trust it if it has the right number of entries. 
+ */ + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { + VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + &count)); + } + if (count != dsl_dataset_phys(ds)->ds_num_children - 1) + goto fail; + for (zap_cursor_init(&zc, mos, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + char buf[ZFS_MAX_DATASET_NAME_LEN]; + VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); + dsl_dir_name(clone->ds_dir, buf); + fnvlist_add_boolean(val, buf); + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); + fnvlist_add_nvlist(propval, ZPROP_VALUE, val); + fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval); +fail: + nvlist_free(val); + nvlist_free(propval); +} + +static void +get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dsl_dataset_has_resume_receive_state(ds)) { + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + size_t packed_size, compressed_size; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + char buf[256]; + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + zio_cksum_t cksum; + fletcher_4_native(compressed, compressed_size, NULL, &cksum); + + str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP); + for (int i = 0; i < compressed_size; i++) { + (void) sprintf(str + i * 2, "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + char *propval = kmem_asprintf("%u-%llx-%llx-%s", + ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); + kmem_free(packed, packed_size); + kmem_free(str, compressed_size * 2 + 1); + kmem_free(compressed, packed_size); + strfree(propval); + } +} + +void +dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + uint64_t refd, avail, uobjs, aobjs, ratio; + + ASSERT(dsl_pool_config_held(dp)); + + ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 
100 : + (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / + dsl_dataset_phys(ds)->ds_compressed_bytes); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, + dsl_dataset_phys(ds)->ds_uncompressed_bytes); + + if (ds->ds_is_snapshot) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + dsl_dataset_phys(ds)->ds_unique_bytes); + get_clones_stat(ds, nv); + } else { + if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { + char buf[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds->ds_prev, buf); + dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf); + } + + dsl_dir_stats(ds->ds_dir, nv); + } + + dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, + dsl_dataset_phys(ds)->ds_creation_time); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, + dsl_dataset_phys(ds)->ds_creation_txg); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, + ds->ds_quota); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, + ds->ds_reserved); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, + dsl_dataset_phys(ds)->ds_guid); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, + dsl_dataset_phys(ds)->ds_unique_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, + ds->ds_object); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, + ds->ds_userrefs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, + DS_IS_DEFER_DESTROY(ds) ? 1 : 0); + + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + uint64_t written, comp, uncomp; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_dataset_t *prev; + + int err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + if (err == 0) { + err = dsl_dataset_space_written(prev, ds, &written, + &comp, &uncomp); + dsl_dataset_rele(prev, FTAG); + if (err == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, + written); + } + } + } + + if (!dsl_dataset_is_snapshot(ds)) { + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + get_receive_resume_stats(ds, nv); + + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. 
+ */ + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + dsl_dataset_t *recv_ds; + dsl_dataset_name(ds, recvname); + if (strlcat(recvname, "/", sizeof (recvname)) < + sizeof (recvname) && + strlcat(recvname, recv_clone_name, sizeof (recvname)) < + sizeof (recvname) && + dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { + get_receive_resume_stats(recv_ds, nv); + dsl_dataset_rele(recv_ds, FTAG); + } + } +} + +void +dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + ASSERT(dsl_pool_config_held(dp)); + + stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg; + stat->dds_inconsistent = + dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT; + stat->dds_guid = dsl_dataset_phys(ds)->ds_guid; + stat->dds_origin[0] = '\0'; + if (ds->ds_is_snapshot) { + stat->dds_is_snapshot = B_TRUE; + stat->dds_num_clones = + dsl_dataset_phys(ds)->ds_num_children - 1; + } else { + stat->dds_is_snapshot = B_FALSE; + stat->dds_num_clones = 0; + + if (dsl_dir_is_clone(ds->ds_dir)) { + dsl_dataset_t *ods; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_origin_obj, + FTAG, &ods)); + dsl_dataset_name(ods, stat->dds_origin); + dsl_dataset_rele(ods, FTAG); + } + } +} + +uint64_t +dsl_dataset_fsid_guid(dsl_dataset_t *ds) +{ + return (ds->ds_fsid_guid); +} + +void +dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - *refdbytesp = ds->ds_phys->ds_used_bytes; + *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); - if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) - *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) + *availbytesp += + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota @@ -2109,316 +2063,452 @@ dsl_dataset_space(dsl_dataset_t *ds, else *availbytesp = 0; } - *usedobjsp = ds->ds_phys->ds_bp.blk_fill; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); + rrw_exit(&ds->ds_bp_rwlock, FTAG); *availobjsp = DN_MAX_OBJECT - *usedobjsp; } boolean_t -dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) +dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) { dsl_pool_t *dp = ds->ds_dir->dd_pool; + uint64_t birth; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); - if (ds->ds_prev == NULL) + ASSERT(dsl_pool_config_held(dp)); + if (snap == NULL) return (B_FALSE); - if (ds->ds_phys->ds_bp.blk_birth > - ds->ds_prev->ds_phys->ds_creation_txg) - return (B_TRUE); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + birth = dsl_dataset_get_blkptr(ds)->blk_birth; + rrw_exit(&ds->ds_bp_rwlock, FTAG); + if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { + objset_t *os, *os_snap; + /* + * It may be that only the ZIL differs, because it was + * reset in the head. Don't count that as being + * modified. 
+ */ + if (dmu_objset_from_ds(ds, &os) != 0) + return (B_TRUE); + if (dmu_objset_from_ds(snap, &os_snap) != 0) + return (B_TRUE); + return (bcmp(&os->os_phys->os_meta_dnode, + &os_snap->os_phys->os_meta_dnode, + sizeof (os->os_phys->os_meta_dnode)) != 0); + } return (B_FALSE); } +typedef struct dsl_dataset_rename_snapshot_arg { + const char *ddrsa_fsname; + const char *ddrsa_oldsnapname; + const char *ddrsa_newsnapname; + boolean_t ddrsa_recursive; + dmu_tx_t *ddrsa_tx; +} dsl_dataset_rename_snapshot_arg_t; + /* ARGSUSED */ static int -dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) { - dsl_dataset_t *ds = arg1; - char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - dsl_dataset_t *hds; + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + int error; uint64_t val; - int err; - - err = dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); - if (err) - return (err); - /* new name better not be in use */ - err = dsl_dataset_snap_lookup(hds, newsnapname, &val); - dsl_dataset_rele(hds, FTAG); + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + if (error != 0) { + /* ignore nonexistent snapshots */ + return (error == ENOENT ? 0 : error); + } - if (err == 0) - err = EEXIST; - else if (err == ENOENT) - err = 0; + /* new name should not exist */ + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); + if (error == 0) + error = SET_ERROR(EEXIST); + else if (error == ENOENT) + error = 0; /* dataset name + 1 for the "@" + the new snapshot name must fit */ - if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) - err = ENAMETOOLONG; + if (dsl_dir_namelen(hds->ds_dir) + 1 + + strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN) + error = SET_ERROR(ENAMETOOLONG); - return (err); + return (error); } -static void -dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx) +static int +dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - const char *newsnapname = arg2; - dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; - int err; - - ASSERT(ds->ds_phys->ds_next_snap_obj != 0); - - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); + int error; - VERIFY(0 == dsl_dataset_get_snapname(ds)); - err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); - ASSERT3U(err, ==, 0); - mutex_enter(&ds->ds_lock); - (void) strcpy(ds->ds_snapname, newsnapname); - mutex_exit(&ds->ds_lock); - err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, - ds->ds_snapname, 8, 1, &ds->ds_object, tx); - ASSERT3U(err, ==, 0); + error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); + if (error != 0) + return (error); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", ds->ds_object); + if (ddrsa->ddrsa_recursive) { + error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_check_impl, ddrsa, + DS_FIND_CHILDREN); + } else { + error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); + } dsl_dataset_rele(hds, FTAG); + return (error); } -struct renamesnaparg { - dsl_sync_task_group_t *dstg; - char failed[MAXPATHLEN]; - char *oldsnap; - char *newsnap; -}; - static int -dsl_snapshot_rename_one(const char *name, void 
*arg) +dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) { - struct renamesnaparg *ra = arg; - dsl_dataset_t *ds = NULL; - char *snapname; - int err; - - snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); - (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); +#ifdef __FreeBSD__ +#ifdef _KERNEL + char *oldname, *newname; +#endif +#endif + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_dataset_t *ds; + uint64_t val; + dmu_tx_t *tx = ddrsa->ddrsa_tx; + int error; - /* - * For recursive snapshot renames the parent won't be changing - * so we just pass name for both the to/from argument. - */ - err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); - if (err != 0) { - strfree(snapname); - return (err == ENOENT ? 0 : err); + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) { + /* ignore nonexistent snapshots */ + return (0); } + VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); + + /* log before we change the name */ + spa_history_log_internal_ds(ds, "rename", tx, + "-> @%s", ddrsa->ddrsa_newsnapname); + + VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, + B_FALSE)); + mutex_enter(&ds->ds_lock); + (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); + mutex_exit(&ds->ds_lock); + VERIFY0(zap_add(dp->dp_meta_objset, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + +#ifdef __FreeBSD__ #ifdef _KERNEL - /* - * For all filesystems undergoing rename, we'll need to unmount it. - */ - (void) zfs_unmount_snap(snapname, NULL); + oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + newname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, + ddrsa->ddrsa_oldsnapname); + snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, + ddrsa->ddrsa_newsnapname); + zfsvfs_update_fromname(oldname, newname); + zvol_rename_minors(oldname, newname); + kmem_free(newname, MAXPATHLEN); + kmem_free(oldname, MAXPATHLEN); #endif - err = dsl_dataset_hold(snapname, ra->dstg, &ds); - strfree(snapname); - if (err != 0) - return (err == ENOENT ? 
0 : err); - - dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); +#endif + dsl_dataset_rele(ds, FTAG); return (0); } -static int -dsl_recursive_rename(char *oldname, const char *newname) +static void +dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { - int err; - struct renamesnaparg *ra; - dsl_sync_task_t *dst; - spa_t *spa; - char *cp, *fsname = spa_strdup(oldname); - int len = strlen(oldname) + 1; + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; - /* truncate the snapshot name to get the fsname */ - cp = strchr(fsname, '@'); - *cp = '\0'; - - err = spa_open(fsname, &spa, FTAG); - if (err) { - kmem_free(fsname, len); - return (err); + VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); + ddrsa->ddrsa_tx = tx; + if (ddrsa->ddrsa_recursive) { + VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_sync_impl, ddrsa, + DS_FIND_CHILDREN)); + } else { + VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); } - ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); - ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + dsl_dataset_rele(hds, FTAG); +} - ra->oldsnap = strchr(oldname, '@') + 1; - ra->newsnap = strchr(newname, '@') + 1; - *ra->failed = '\0'; +int +dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive) +{ + dsl_dataset_rename_snapshot_arg_t ddrsa; - err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, - DS_FIND_CHILDREN); - kmem_free(fsname, len); + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = recursive; + + return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, + dsl_dataset_rename_snapshot_sync, &ddrsa, + 1, ZFS_SPACE_CHECK_RESERVED)); +} - if (err == 0) { - err = dsl_sync_task_group_wait(ra->dstg); - } +/* + * If we're doing an ownership handoff, we need to make sure that there is + * only one long hold on the dataset. We're not allowed to change anything here + * so we don't permanently release the long hold or regular hold here. We want + * to do this only when syncing to avoid the dataset unexpectedly going away + * when we release the long hold. 
+ */ +static int +dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) +{ + boolean_t held; - for (dst = list_head(&ra->dstg->dstg_tasks); dst; - dst = list_next(&ra->dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - if (dst->dst_err) { - dsl_dir_name(ds->ds_dir, ra->failed); - (void) strlcat(ra->failed, "@", sizeof (ra->failed)); - (void) strlcat(ra->failed, ra->newsnap, - sizeof (ra->failed)); - } - dsl_dataset_rele(ds, ra->dstg); - } + if (!dmu_tx_is_syncing(tx)) + return (0); - if (err) - (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); + if (owner != NULL) { + VERIFY3P(ds->ds_owner, ==, owner); + dsl_dataset_long_rele(ds, owner); + } - dsl_sync_task_group_destroy(ra->dstg); - kmem_free(ra, sizeof (struct renamesnaparg)); - spa_close(spa, FTAG); - return (err); -} + held = dsl_dataset_long_held(ds); -static int -dsl_valid_rename(const char *oldname, void *arg) -{ - int delta = *(int *)arg; + if (owner != NULL) + dsl_dataset_long_hold(ds, owner); - if (strlen(oldname) + delta >= MAXNAMELEN) - return (ENAMETOOLONG); + if (held) + return (SET_ERROR(EBUSY)); return (0); } -#pragma weak dmu_objset_rename = dsl_dataset_rename -int -dsl_dataset_rename(const char *oldname, const char *newname, boolean_t recursive) +typedef struct dsl_dataset_rollback_arg { + const char *ddra_fsname; + void *ddra_owner; + nvlist_t *ddra_result; +} dsl_dataset_rollback_arg_t; + +static int +dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd; + dsl_dataset_rollback_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; - const char *tail; - int err; + int64_t unused_refres_delta; + int error; - err = dsl_dir_open(oldname, FTAG, &dd, &tail); - if (err) - return (err); + error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); + if (error != 0) + return (error); - if (tail == NULL) { - int delta = strlen(newname) - strlen(oldname); + /* must not be a snapshot */ + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } - /* if we're growing, validate child name lengths */ - if (delta > 0) - err = dmu_objset_find(oldname, dsl_valid_rename, - &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + /* must have a most recent snapshot */ + if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } - if (!err) { - /* - * If there are more than 2 references there may be - * holds hanging around that haven't been cleared - * out yet. - */ - if (dmu_buf_refcount(dd->dd_dbuf) > 2) - txg_wait_synced(dd->dd_pool, 0); + /* + * No rollback to a snapshot created in the current txg, because + * the rollback may dirty the dataset and create blocks that are + * not reachable from the rootbp while having a birth txg that + * falls into the snapshot's range. 
+ */ + if (dmu_tx_is_syncing(tx) && + dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EAGAIN)); + } - err = dsl_dir_rename(dd, newname); + /* must not have any bookmarks after the most recent snapshot */ + nvlist_t *proprequest = fnvlist_alloc(); + fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG)); + nvlist_t *bookmarks = fnvlist_alloc(); + error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks); + fnvlist_free(proprequest); + if (error != 0) + return (error); + for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) { + nvlist_t *valuenv = + fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), + zfs_prop_to_name(ZFS_PROP_CREATETXG)); + uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); + if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + fnvlist_free(bookmarks); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EEXIST)); } - dsl_dir_close(dd, FTAG); - return (err); } + fnvlist_free(bookmarks); - if (tail[0] != '@') { - /* the name ended in a nonexistent component */ - dsl_dir_close(dd, FTAG); - return (ENOENT); - } - - dsl_dir_close(dd, FTAG); - - /* new name must be snapshot in same filesystem */ - tail = strchr(newname, '@'); - if (tail == NULL) - return (EINVAL); - tail++; - if (strncmp(oldname, newname, tail - newname) != 0) - return (EXDEV); + error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } - if (recursive) { - err = dsl_recursive_rename(oldname, newname); - } else { - err = dsl_dataset_hold(oldname, FTAG, &ds); - if (err) - return (err); + /* + * Check if the snap we are rolling back to uses more than + * the refquota. + */ + if (ds->ds_quota != 0 && + dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EDQUOT)); + } - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_snapshot_rename_check, - dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); + /* + * When we do the clone swap, we will temporarily use more space + * due to the refreservation (the head will no longer have any + * unique space, so the entire amount of the refreservation will need + * to be free). We will immediately destroy the clone, freeing + * this space, but the freeing happens over many txg's. 
+ */ + unused_refres_delta = (int64_t)MIN(ds->ds_reserved, + dsl_dataset_phys(ds)->ds_unique_bytes); + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); } - return (err); + dsl_dataset_rele(ds, FTAG); + return (0); } -struct promotenode { - list_node_t link; - dsl_dataset_t *ds; +static void +dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rollback_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds, *clone; + uint64_t cloneobj; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); + + dsl_dataset_name(ds->ds_prev, namebuf); + fnvlist_add_string(ddra->ddra_result, "target", namebuf); + + cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", + ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); + + VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); + + dsl_dataset_clone_swap_sync_impl(clone, ds, tx); + dsl_dataset_zero_zil(ds, tx); + + dsl_destroy_head_sync_impl(clone, tx); + + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * Rolls back the given filesystem or volume to the most recent snapshot. + * The name of the most recent snapshot will be returned under key "target" + * in the result nvlist. + * + * If owner != NULL: + * - The existing dataset MUST be owned by the specified owner at entry + * - Upon return, dataset will still be held by the same owner, whether we + * succeed or not. + * + * This mode is required any time the existing filesystem is mounted. See + * notes above zfs_suspend_fs() for further details. + */ +int +dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result) +{ + dsl_dataset_rollback_arg_t ddra; + + ddra.ddra_fsname = fsname; + ddra.ddra_owner = owner; + ddra.ddra_result = result; + + return (dsl_sync_task(fsname, dsl_dataset_rollback_check, + dsl_dataset_rollback_sync, &ddra, + 1, ZFS_SPACE_CHECK_RESERVED)); +} + +struct promotenode { + list_node_t link; + dsl_dataset_t *ds; }; -struct promotearg { +typedef struct dsl_dataset_promote_arg { + const char *ddpa_clonename; + dsl_dataset_t *ddpa_clone; list_t shared_snaps, origin_snaps, clone_snaps; - dsl_dataset_t *origin_origin, *origin_head; + dsl_dataset_t *origin_origin; /* origin of the origin */ uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; -}; + cred_t *cr; +} dsl_dataset_promote_arg_t; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); +static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, + void *tag); +static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); -/* ARGSUSED */ static int -dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->shared_snaps); - dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; int err; + uint64_t unused; + uint64_t ss_mv_cnt; + size_t max_snap_len; - /* Check that it is a real clone */ - if (!dsl_dir_is_clone(hds->ds_dir)) - return (EINVAL); + err = promote_hold(ddpa, dp, FTAG); + if (err != 0) + return (err); - /* Since this is so expensive, don't do the preliminary check */ - if (!dmu_tx_is_syncing(tx)) + hds = 
ddpa->ddpa_clone; + max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1; + + if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) { + promote_rele(ddpa, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* + * Compute and check the amount of space to transfer. Since this is + * so expensive, don't do the preliminary check. + */ + if (!dmu_tx_is_syncing(tx)) { + promote_rele(ddpa, FTAG); return (0); + } - if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) - return (EXDEV); + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; /* compute origin's new unique space */ - snap = list_tail(&pa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); - err = bplist_space_birthrange(&snap->ds->ds_deadlist, - origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); - if (err) - return (err); + snap = list_tail(&ddpa->clone_snaps); + ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, + origin_ds->ds_object); + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX, + &ddpa->unique, &unused, &unused); /* * Walk the snapshots that we are moving * * Compute space to transfer. Consider the incremental changes - * to used for each snapshot: + * to used by each snapshot: * (my used) = (prev's used) + (blocks born) - (blocks killed) * So each snapshot gave birth to: * (blocks born) = (my used) - (prev's used) + (blocks killed) @@ -2429,51 +2519,71 @@ dsl_dataset_promote_check(void *arg1, vo * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ - pa->used = origin_ds->ds_phys->ds_used_bytes; - pa->comp = origin_ds->ds_phys->ds_compressed_bytes; - pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; - for (snap = list_head(&pa->shared_snaps); snap; - snap = list_next(&pa->shared_snaps, snap)) { + ss_mv_cnt = 0; + ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes; + ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes; + ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes; + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; + ss_mv_cnt++; + + /* + * If there are long holds, we won't be able to evict + * the objset. + */ + if (dsl_dataset_long_held(ds)) { + err = SET_ERROR(EBUSY); + goto out; + } + /* Check that the snapshot name does not conflict */ - VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_get_snapname(ds)); + if (strlen(ds->ds_snapname) >= max_snap_len) { + err = SET_ERROR(ENAMETOOLONG); + goto out; + } err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) { - err = EEXIST; + (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname); + err = SET_ERROR(EEXIST); goto out; } if (err != ENOENT) goto out; /* The very first snapshot does not have a deadlist */ - if (ds->ds_phys->ds_prev_snap_obj == 0) + if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) continue; - if (err = bplist_space(&ds->ds_deadlist, - &dlused, &dlcomp, &dluncomp)) - goto out; - pa->used += dlused; - pa->comp += dlcomp; - pa->uncomp += dluncomp; + dsl_deadlist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp); + ddpa->used += dlused; + ddpa->comp += dlcomp; + ddpa->uncomp += dluncomp; } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. 
*/ - if (pa->origin_origin) { - pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; - pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; - pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; + if (ddpa->origin_origin) { + ddpa->used -= + dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; + ddpa->comp -= + dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; + ddpa->uncomp -= + dsl_dataset_phys(ddpa->origin_origin)-> + ds_uncompressed_bytes; } - /* Check that there is enough space here */ + /* Check that there is enough space and limit headroom here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, - pa->used); - if (err) - return (err); + 0, ss_mv_cnt, ddpa->used, ddpa->cr); + if (err != 0) + goto out; /* * Compute the amounts of space that will be used by snapshots @@ -2481,120 +2591,217 @@ dsl_dataset_promote_check(void *arg1, vo * it is the amount of space that will be on all of their * deadlists (that was not born before their new origin). */ - if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { uint64_t space; /* * Note, typically this will not be a clone of a clone, - * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so - * these snaplist_space() -> bplist_space_birthrange() + * so dd_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> dsl_deadlist_space_range() * calls will be fast because they do not have to * iterate over all bps. */ - snap = list_head(&pa->origin_snaps); - err = snaplist_space(&pa->shared_snaps, - snap->ds->ds_origin_txg, &pa->cloneusedsnap); - if (err) - return (err); + snap = list_head(&ddpa->origin_snaps); + err = snaplist_space(&ddpa->shared_snaps, + snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); + if (err != 0) + goto out; - err = snaplist_space(&pa->clone_snaps, - snap->ds->ds_origin_txg, &space); - if (err) - return (err); - pa->cloneusedsnap += space; + err = snaplist_space(&ddpa->clone_snaps, + snap->ds->ds_dir->dd_origin_txg, &space); + if (err != 0) + goto out; + ddpa->cloneusedsnap += space; } - if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { - err = snaplist_space(&pa->origin_snaps, - origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); - if (err) - return (err); + if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & + DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&ddpa->origin_snaps, + dsl_dataset_phys(origin_ds)->ds_creation_txg, + &ddpa->originusedsnap); + if (err != 0) + goto out; } - return (0); out: - pa->err_ds = snap->ds->ds_snapname; + promote_rele(ddpa, FTAG); return (err); } static void -dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *hds = arg1; - struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->shared_snaps); - dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; dsl_dataset_t *origin_head; - dsl_dir_t *dd = hds->ds_dir; - dsl_pool_t *dp = hds->ds_dir->dd_pool; + dsl_dir_t *dd; dsl_dir_t *odd = NULL; uint64_t oldnext_obj; int64_t delta; +#if defined(__FreeBSD__) && defined(_KERNEL) + char *oldname, *newname; +#endif - ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); + VERIFY0(promote_hold(ddpa, dp, FTAG)); + hds = ddpa->ddpa_clone; - snap = list_head(&pa->origin_snaps); + 
ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); + + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; + dd = hds->ds_dir; + + snap = list_head(&ddpa->origin_snaps); origin_head = snap->ds; /* * We need to explicitly open odd, since origin_ds's dd will be * changing. */ - VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, + VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); - oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; - snap = list_tail(&pa->clone_snaps); - ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); - origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; + oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; + snap = list_tail(&ddpa->clone_snaps); + ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, + origin_ds->ds_object); + dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ - if (origin_ds->ds_phys->ds_next_clones_obj) { - remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); - VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, - origin_ds->ds_phys->ds_next_clones_obj, + if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { + dsl_dataset_remove_from_next_clones(origin_ds, + snap->ds->ds_object, tx); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dataset_phys(origin_ds)->ds_next_clones_obj, oldnext_obj, tx)); } /* change origin */ dmu_buf_will_dirty(dd->dd_dbuf, tx); - ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); - dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; - hds->ds_origin_txg = origin_head->ds_origin_txg; + ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); + dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; + dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); - odd->dd_phys->dd_origin_obj = origin_ds->ds_object; - origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; + dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_dir->dd_origin_txg = + dsl_dataset_phys(origin_ds)->ds_creation_txg; + + /* change dd_clone entries */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, + hds->ds_object, tx)); + + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, + origin_head->ds_object, tx)); + if (dsl_dir_phys(dd)->dd_clones == 0) { + dsl_dir_phys(dd)->dd_clones = + zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, + DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); + } + +#if defined(__FreeBSD__) && defined(_KERNEL) + /* Take the spa_namespace_lock early so zvol renames don't deadlock. 
*/ + mutex_enter(&spa_namespace_lock); + + oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + newname = kmem_alloc(MAXPATHLEN, KM_SLEEP); +#endif /* move snapshots to this dir */ - for (snap = list_head(&pa->shared_snaps); snap; - snap = list_next(&pa->shared_snaps, snap)) { + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; - /* unregister props as dsl_dir is changing */ + /* + * Property callbacks are registered to a particular + * dsl_dir. Since ours is changing, evict the objset + * so that they will be unregistered from the old dsl_dir. + */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } + /* move snap name entry */ - VERIFY(0 == dsl_dataset_get_snapname(ds)); - VERIFY(0 == dsl_dataset_snap_remove(origin_head, - ds->ds_snapname, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, - hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, + VERIFY0(dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_snap_remove(origin_head, + ds->ds_snapname, tx, B_TRUE)); + VERIFY0(zap_add(dp->dp_meta_objset, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + dsl_fs_ss_count_adjust(hds->ds_dir, 1, + DD_FIELD_SNAPSHOT_COUNT, tx); + /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); - ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); - ds->ds_phys->ds_dir_obj = dd->dd_object; + ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); + dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; ASSERT3P(ds->ds_dir, ==, odd); - dsl_dir_close(ds->ds_dir, ds); - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, + dsl_dir_rele(ds->ds_dir, ds); + VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); - ASSERT3U(dsl_prop_numcb(ds), ==, 0); +#if defined(__FreeBSD__) && defined(_KERNEL) + dsl_dataset_name(ds, newname); + zfsvfs_update_fromname(oldname, newname); + zvol_rename_minors(oldname, newname); +#endif + + /* move any clone references */ + if (dsl_dataset_phys(ds)->ds_next_clones_obj && + spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *cnds; + uint64_t o; + + if (za.za_first_integer == oldnext_obj) { + /* + * We've already moved the + * origin's reference. + */ + continue; + } + + VERIFY0(dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &cnds)); + o = dsl_dir_phys(cnds->ds_dir)-> + dd_head_dataset_obj; + + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(odd)->dd_clones, o, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones, o, tx)); + dsl_dataset_rele(cnds, FTAG); + } + zap_cursor_fini(&zc); + } + + ASSERT(!dsl_prop_hascb(ds)); } +#if defined(__FreeBSD__) && defined(_KERNEL) + mutex_exit(&spa_namespace_lock); + + kmem_free(newname, MAXPATHLEN); + kmem_free(oldname, MAXPATHLEN); +#endif /* * Change space accounting. * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either @@ -2602,32 +2809,31 @@ dsl_dataset_promote_sync(void *arg1, voi * is true for each of {clone,origin} independently. 
*/ - delta = pa->cloneusedsnap - - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + delta = ddpa->cloneusedsnap - + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, >=, 0); - ASSERT3U(pa->used, >=, delta); + ASSERT3U(ddpa->used, >=, delta); dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(dd, DD_USED_HEAD, - pa->used - delta, pa->comp, pa->uncomp, tx); + ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); - delta = pa->originusedsnap - - odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + delta = ddpa->originusedsnap - + dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, <=, 0); - ASSERT3U(pa->used, >=, -delta); + ASSERT3U(ddpa->used, >=, -delta); dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(odd, DD_USED_HEAD, - -pa->used - delta, -pa->comp, -pa->uncomp, tx); + -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); - origin_ds->ds_phys->ds_unique_bytes = pa->unique; + dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; /* log history record */ - spa_history_internal_log(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, - cr, "dataset = %llu", hds->ds_object); + spa_history_log_internal_ds(hds, "promote", tx, ""); - dsl_dir_close(odd, FTAG); + dsl_dir_rele(odd, FTAG); + promote_rele(ddpa, FTAG); } -static char *snaplist_tag = "snaplist"; /* * Make a list of dsl_dataset_t's for the snapshots between first_obj * (exclusive) and last_obj (inclusive). The list will be in reverse @@ -2635,13 +2841,11 @@ static char *snaplist_tag = "snaplist"; * snapshots back to this dataset's origin. */ static int -snaplist_make(dsl_pool_t *dp, boolean_t own, - uint64_t first_obj, uint64_t last_obj, list_t *l) +snaplist_make(dsl_pool_t *dp, + uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) { uint64_t obj = last_obj; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); - list_create(l, sizeof (struct promotenode), offsetof(struct promotenode, link)); @@ -2650,31 +2854,18 @@ snaplist_make(dsl_pool_t *dp, boolean_t struct promotenode *snap; int err; - if (own) { - err = dsl_dataset_own_obj(dp, obj, - 0, snaplist_tag, &ds); - if (err == 0) - dsl_dataset_make_exclusive(ds, snaplist_tag); - } else { - err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); - } - if (err == ENOENT) { - /* lost race with snapshot destroy */ - struct promotenode *last = list_tail(l); - ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); - obj = last->ds->ds_phys->ds_prev_snap_obj; - continue; - } else if (err) { + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); + ASSERT(err != ENOENT); + if (err != 0) return (err); - } if (first_obj == 0) - first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; - snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); + snap = kmem_alloc(sizeof (*snap), KM_SLEEP); snap->ds = ds; list_insert_tail(l, snap); - obj = ds->ds_phys->ds_prev_snap_obj; + obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } return (0); @@ -2687,1077 +2878,873 @@ snaplist_space(list_t *l, uint64_t mintx *spacep = 0; for (snap = list_head(l); snap; snap = list_next(l, snap)) { - uint64_t used; - int err = bplist_space_birthrange(&snap->ds->ds_deadlist, - mintxg, UINT64_MAX, &used); - if (err) - return (err); + uint64_t used, comp, uncomp; + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used, &comp, &uncomp); *spacep += used; } return (0); } static void -snaplist_destroy(list_t *l, boolean_t own) +snaplist_destroy(list_t *l, void *tag) { struct promotenode *snap; - 
if (!l || !list_link_active(&l->list_head)) + if (l == NULL || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { list_remove(l, snap); - if (own) - dsl_dataset_disown(snap->ds, snaplist_tag); - else - dsl_dataset_rele(snap->ds, snaplist_tag); - kmem_free(snap, sizeof (struct promotenode)); + dsl_dataset_rele(snap->ds, tag); + kmem_free(snap, sizeof (*snap)); } list_destroy(l); } -/* - * Promote a clone. Nomenclature note: - * "clone" or "cds": the original clone which is being promoted - * "origin" or "ods": the snapshot which is originally clone's origin - * "origin head" or "ohds": the dataset which is the head - * (filesystem/volume) for the origin - * "origin origin": the origin of the origin's filesystem (typically - * NULL, indicating that the clone is not a clone of a clone). - */ -int -dsl_dataset_promote(const char *name, char *conflsnap) +static int +promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) { - dsl_dataset_t *ds; + int error; dsl_dir_t *dd; - dsl_pool_t *dp; - dmu_object_info_t doi; - struct promotearg pa = { 0 }; struct promotenode *snap; - int err; - - err = dsl_dataset_hold(name, FTAG, &ds); - if (err) - return (err); - dd = ds->ds_dir; - dp = dd->dd_pool; - err = dmu_object_info(dp->dp_meta_objset, - ds->ds_phys->ds_snapnames_zapobj, &doi); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } + error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, + &ddpa->ddpa_clone); + if (error != 0) + return (error); + dd = ddpa->ddpa_clone->ds_dir; - if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { - dsl_dataset_rele(ds, FTAG); - return (EINVAL); + if (ddpa->ddpa_clone->ds_is_snapshot || + !dsl_dir_is_clone(dd)) { + dsl_dataset_rele(ddpa->ddpa_clone, tag); + return (SET_ERROR(EINVAL)); } - /* - * We are going to inherit all the snapshots taken before our - * origin (i.e., our new origin will be our parent's origin). - * Take ownership of them so that we can rename them into our - * namespace. 
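A stand-alone sketch of the walk that snaplist_make() (above) now performs with plain holds: start at last_obj, follow each snapshot's previous-snapshot pointer until first_obj, and append at the tail so the list comes out newest-first. All types and names below are illustrative stand-ins, not the dsl_dataset/list_t interfaces themselves.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct snap {				/* stand-in for a held snapshot */
	uint64_t obj;			/* object number */
	uint64_t prev_obj;		/* ds_prev_snap_obj analogue */
};

struct node {				/* stand-in for struct promotenode */
	const struct snap *s;
	struct node *next;
};

static const struct snap *
lookup(const struct snap *tbl, int n, uint64_t obj)
{
	for (int i = 0; i < n; i++)
		if (tbl[i].obj == obj)
			return (&tbl[i]);
	return (NULL);
}

/* Walk backward from last_obj to first_obj (exclusive), newest first. */
static struct node *
snaplist(const struct snap *tbl, int n, uint64_t first_obj, uint64_t last_obj)
{
	struct node *head = NULL, **tailp = &head;

	for (uint64_t obj = last_obj; obj != first_obj; ) {
		const struct snap *s = lookup(tbl, n, obj);
		if (s == NULL)
			break;
		struct node *nn = malloc(sizeof (*nn));
		nn->s = s;
		nn->next = NULL;
		*tailp = nn;			/* list_insert_tail analogue */
		tailp = &nn->next;
		obj = s->prev_obj;
	}
	return (head);
}

int
main(void)
{
	/* snapshots 10 -> 20 -> 30, oldest to newest */
	struct snap tbl[] = { { 10, 0 }, { 20, 10 }, { 30, 20 } };

	for (struct node *n = snaplist(tbl, 3, 0, 30); n != NULL; ) {
		struct node *next = n->next;
		printf("obj %llu\n", (unsigned long long)n->s->obj);
		free(n);
		n = next;
	}
	/* prints 30, 20, 10 */
	return (0);
}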
- */ - rw_enter(&dp->dp_config_rwlock, RW_READER); - - err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, - &pa.shared_snaps); - if (err != 0) + error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, + &ddpa->shared_snaps, tag); + if (error != 0) goto out; - err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); - if (err != 0) + error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, + &ddpa->clone_snaps, tag); + if (error != 0) goto out; - snap = list_head(&pa.shared_snaps); - ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); - err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, - snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); - if (err != 0) + snap = list_head(&ddpa->shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); + error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, + dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, + &ddpa->origin_snaps, tag); + if (error != 0) goto out; - if (dsl_dir_is_clone(snap->ds->ds_dir)) { - err = dsl_dataset_own_obj(dp, - snap->ds->ds_dir->dd_phys->dd_origin_obj, - 0, FTAG, &pa.origin_origin); - if (err != 0) + if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { + error = dsl_dataset_hold_obj(dp, + dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, + tag, &ddpa->origin_origin); + if (error != 0) goto out; } - out: - rw_exit(&dp->dp_config_rwlock); - - /* - * Add in 128x the snapnames zapobj size, since we will be moving - * a bunch of snapnames to the promoted ds, and dirtying their - * bonus buffers. - */ - if (err == 0) { - err = dsl_sync_task_do(dp, dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, - 2 + 2 * doi.doi_physical_blocks_512); - if (err && pa.err_ds && conflsnap) - (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); - } - - snaplist_destroy(&pa.shared_snaps, B_TRUE); - snaplist_destroy(&pa.clone_snaps, B_FALSE); - snaplist_destroy(&pa.origin_snaps, B_FALSE); - if (pa.origin_origin) - dsl_dataset_disown(pa.origin_origin, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); -} - -struct cloneswaparg { - dsl_dataset_t *cds; /* clone dataset */ - dsl_dataset_t *ohds; /* origin's head dataset */ - boolean_t force; - int64_t unused_refres_delta; /* change in unconsumed refreservation */ -}; - -/* ARGSUSED */ -static int -dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - struct cloneswaparg *csa = arg1; - - /* they should both be heads */ - if (dsl_dataset_is_snapshot(csa->cds) || - dsl_dataset_is_snapshot(csa->ohds)) - return (EINVAL); - - /* the branch point should be just before them */ - if (csa->cds->ds_prev != csa->ohds->ds_prev) - return (EINVAL); - - /* cds should be the clone (unless they are unrelated) */ - if (csa->cds->ds_prev != NULL && - csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && - csa->ohds->ds_object != - csa->cds->ds_prev->ds_phys->ds_next_snap_obj) - return (EINVAL); - - /* the clone should be a child of the origin */ - if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) - return (EINVAL); - - /* ohds shouldn't be modified unless 'force' */ - if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) - return (ETXTBSY); - - /* adjust amount of any unconsumed refreservation */ - csa->unused_refres_delta = - (int64_t)MIN(csa->ohds->ds_reserved, - csa->ohds->ds_phys->ds_unique_bytes) - - (int64_t)MIN(csa->ohds->ds_reserved, - csa->cds->ds_phys->ds_unique_bytes); - - if (csa->unused_refres_delta > 0 && - csa->unused_refres_delta > - 
dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - - if (csa->ohds->ds_quota != 0 && - csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) - return (EDQUOT); - - return (0); + if (error != 0) + promote_rele(ddpa, tag); + return (error); } -/* ARGSUSED */ static void -dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) { - struct cloneswaparg *csa = arg1; - dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; - - ASSERT(csa->cds->ds_reserved == 0); - ASSERT(csa->ohds->ds_quota == 0 || - csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); - - dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); - dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); - - if (csa->cds->ds_objset != NULL) { - dmu_objset_evict(csa->cds->ds_objset); - csa->cds->ds_objset = NULL; - } - - if (csa->ohds->ds_objset != NULL) { - dmu_objset_evict(csa->ohds->ds_objset); - csa->ohds->ds_objset = NULL; - } - - /* - * Reset origin's unique bytes, if it exists. - */ - if (csa->cds->ds_prev) { - dsl_dataset_t *origin = csa->cds->ds_prev; - dmu_buf_will_dirty(origin->ds_dbuf, tx); - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &origin->ds_phys->ds_unique_bytes)); - } - - /* swap blkptrs */ - { - blkptr_t tmp; - tmp = csa->ohds->ds_phys->ds_bp; - csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; - csa->cds->ds_phys->ds_bp = tmp; - } - - /* set dd_*_bytes */ - { - int64_t dused, dcomp, duncomp; - uint64_t cdl_used, cdl_comp, cdl_uncomp; - uint64_t odl_used, odl_comp, odl_uncomp; - - ASSERT3U(csa->cds->ds_dir->dd_phys-> - dd_used_breakdown[DD_USED_SNAP], ==, 0); - - VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, - &cdl_comp, &cdl_uncomp)); - VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, - &odl_comp, &odl_uncomp)); - - dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - - (csa->ohds->ds_phys->ds_used_bytes + odl_used); - dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - - (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); - duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + - cdl_uncomp - - (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); - - dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, - dused, dcomp, duncomp, tx); - dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, - -dused, -dcomp, -duncomp, tx); - - /* - * The difference in the space used by snapshots is the - * difference in snapshot space due to the head's - * deadlist (since that's the only thing that's - * changing that affects the snapused). 
- */ - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); - VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, - csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); - dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, - DD_USED_HEAD, DD_USED_SNAP, tx); - } - -#define SWITCH64(x, y) \ - { \ - uint64_t __tmp = (x); \ - (x) = (y); \ - (y) = __tmp; \ - } - - /* swap ds_*_bytes */ - SWITCH64(csa->ohds->ds_phys->ds_used_bytes, - csa->cds->ds_phys->ds_used_bytes); - SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, - csa->cds->ds_phys->ds_compressed_bytes); - SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, - csa->cds->ds_phys->ds_uncompressed_bytes); - SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, - csa->cds->ds_phys->ds_unique_bytes); - - /* apply any parent delta for change in unconsumed refreservation */ - dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, - csa->unused_refres_delta, 0, 0, tx); - - /* swap deadlists */ - bplist_close(&csa->cds->ds_deadlist); - bplist_close(&csa->ohds->ds_deadlist); - SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, - csa->cds->ds_phys->ds_deadlist_obj); - VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, - csa->cds->ds_phys->ds_deadlist_obj)); - VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, - csa->ohds->ds_phys->ds_deadlist_obj)); - - dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx); + snaplist_destroy(&ddpa->shared_snaps, tag); + snaplist_destroy(&ddpa->clone_snaps, tag); + snaplist_destroy(&ddpa->origin_snaps, tag); + if (ddpa->origin_origin != NULL) + dsl_dataset_rele(ddpa->origin_origin, tag); + dsl_dataset_rele(ddpa->ddpa_clone, tag); } /* - * Swap 'clone' with its origin head datasets. Used at the end of "zfs - * recv" into an existing fs to swizzle the file system to the new - * version, and by "zfs rollback". Can also be used to swap two - * independent head datasets if neither has any snapshots. + * Promote a clone. + * + * If it fails due to a conflicting snapshot name, "conflsnap" will be filled + * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.) */ int -dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, - boolean_t force) -{ - struct cloneswaparg csa; - int error; - - ASSERT(clone->ds_owner); - ASSERT(origin_head->ds_owner); -retry: - /* Need exclusive access for the swap */ - rw_enter(&clone->ds_rwlock, RW_WRITER); - if (!rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { - rw_exit(&clone->ds_rwlock); - rw_enter(&origin_head->ds_rwlock, RW_WRITER); - if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { - rw_exit(&origin_head->ds_rwlock); - goto retry; - } - } - csa.cds = clone; - csa.ohds = origin_head; - csa.force = force; - error = dsl_sync_task_do(clone->ds_dir->dd_pool, - dsl_dataset_clone_swap_check, - dsl_dataset_clone_swap_sync, &csa, NULL, 9); - return (error); -} - -/* - * Given a pool name and a dataset object number in that pool, - * return the name of that dataset. - */ -int -dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) +dsl_dataset_promote(const char *name, char *conflsnap) { - spa_t *spa; - dsl_pool_t *dp; - dsl_dataset_t *ds; + dsl_dataset_promote_arg_t ddpa = { 0 }; + uint64_t numsnaps; int error; + objset_t *os; - if ((error = spa_open(pname, &spa, FTAG)) != 0) + /* + * We will modify space proportional to the number of + * snapshots. Compute numsnaps. 
+ */ + error = dmu_objset_hold(name, FTAG, &os); + if (error != 0) + return (error); + error = zap_count(dmu_objset_pool(os)->dp_meta_objset, + dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, + &numsnaps); + dmu_objset_rele(os, FTAG); + if (error != 0) return (error); - dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); - if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { - dsl_dataset_name(ds, buf); - dsl_dataset_rele(ds, FTAG); - } - rw_exit(&dp->dp_config_rwlock); - spa_close(spa, FTAG); - return (error); + ddpa.ddpa_clonename = name; + ddpa.err_ds = conflsnap; + ddpa.cr = CRED(); + + return (dsl_sync_task(name, dsl_dataset_promote_check, + dsl_dataset_promote_sync, &ddpa, + 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED)); } int -dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, - uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) +dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) { - int error = 0; - - ASSERT3S(asize, >, 0); - /* - * *ref_rsrv is the portion of asize that will come from any - * unconsumed refreservation space. + * "slack" factor for received datasets with refquota set on them. + * See the bottom of this function for details on its use. */ - *ref_rsrv = 0; + uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation; + int64_t unused_refres_delta; - mutex_enter(&ds->ds_lock); - /* - * Make a space adjustment for reserved bytes. - */ - if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { - ASSERT3U(*used, >=, - ds->ds_reserved - ds->ds_phys->ds_unique_bytes); - *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); - *ref_rsrv = - asize - MIN(asize, parent_delta(ds, asize + inflight)); - } - - if (!check_quota || ds->ds_quota == 0) { - mutex_exit(&ds->ds_lock); - return (0); - } - /* - * If they are requesting more space, and our current estimate - * is over quota, they get to try again unless the actual - * on-disk is over quota and there are no pending changes (which - * may free up space for us). 
- */ - if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { - if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) - error = ERESTART; - else - error = EDQUOT; - } - mutex_exit(&ds->ds_lock); - - return (error); -} - -/* ARGSUSED */ -static int -dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - int err; - - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) - return (ENOTSUP); + /* they should both be heads */ + if (clone->ds_is_snapshot || + origin_head->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + /* if we are not forcing, the branch point should be just before them */ + if (!force && clone->ds_prev != origin_head->ds_prev) + return (SET_ERROR(EINVAL)); + + /* clone should be the clone (unless they are unrelated) */ + if (clone->ds_prev != NULL && + clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && + origin_head->ds_dir != clone->ds_prev->ds_dir) + return (SET_ERROR(EINVAL)); - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + /* the clone should be a child of the origin */ + if (clone->ds_dir->dd_parent != origin_head->ds_dir) + return (SET_ERROR(EINVAL)); - if (psa->psa_effective_value == 0) - return (0); + /* origin_head shouldn't be modified unless 'force' */ + if (!force && + dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev)) + return (SET_ERROR(ETXTBSY)); + + /* origin_head should have no long holds (e.g. is not mounted) */ + if (dsl_dataset_handoff_check(origin_head, owner, tx)) + return (SET_ERROR(EBUSY)); + + /* check amount of any unconsumed refreservation */ + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(origin_head)->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(clone)->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) + return (SET_ERROR(ENOSPC)); - if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || - psa->psa_effective_value < ds->ds_reserved) - return (ENOSPC); + /* + * The clone can't be too much over the head's refquota. + * + * To ensure that the entire refquota can be used, we allow one + * transaction to exceed the the refquota. Therefore, this check + * needs to also allow for the space referenced to be more than the + * refquota. The maximum amount of space that one transaction can use + * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this + * overage ensures that we are able to receive a filesystem that + * exceeds the refquota on the source system. + * + * So that overage is the refquota_slack we use below. 
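The clone-swap check around this point computes the change in unconsumed refreservation as a difference of MIN(reserved, unique) terms and, just below, lets the clone exceed the origin head's refquota only by one transaction's worth of slack (DMU_MAX_ACCESS * spa_asize_inflation). A stand-alone model of that arithmetic with made-up numbers; the field names are illustrative, not the on-disk structures.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static int
clone_swap_space_check(uint64_t head_reserved, uint64_t head_unique,
    uint64_t head_quota, uint64_t clone_unique, uint64_t clone_referenced,
    uint64_t space_avail, uint64_t refquota_slack)
{
	/* change in unconsumed refreservation if the swap happens */
	int64_t unused_refres_delta =
	    (int64_t)MIN(head_reserved, head_unique) -
	    (int64_t)MIN(head_reserved, clone_unique);

	if (unused_refres_delta > 0 &&
	    (uint64_t)unused_refres_delta > space_avail)
		return (ENOSPC);

	/* the clone may exceed the head's refquota by at most the slack */
	if (head_quota != 0 &&
	    clone_referenced > head_quota + refquota_slack)
		return (EDQUOT);

	return (0);
}

int
main(void)
{
	/* 1 GiB refquota, clone refers to 1 GiB + 4 MiB, 64 MiB slack: OK */
	int err = clone_swap_space_check(0, 0, 1ULL << 30, 0,
	    (1ULL << 30) + (4ULL << 20), 1ULL << 40, 64ULL << 20);

	printf("check: %d\n", err);	/* 0 */
	return (0);
}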
+ */ + if (origin_head->ds_quota != 0 && + dsl_dataset_phys(clone)->ds_referenced_bytes > + origin_head->ds_quota + refquota_slack) + return (SET_ERROR(EDQUOT)); return (0); } -extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); - void -dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; - - dsl_prop_set_sync(ds, psa, cr, tx); - DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); - - if (ds->ds_quota != effective_value) { - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_quota = effective_value; - - spa_history_internal_log(LOG_DS_REFQUOTA, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", - (longlong_t)ds->ds_quota, ds->ds_object); - } -} - -int -dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) +dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx) { - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); + dsl_pool_t *dp = dmu_tx_pool(tx); + int64_t unused_refres_delta; + ASSERT(clone->ds_reserved == 0); /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. + * NOTE: On DEBUG kernels there could be a race between this and + * the check function if spa_asize_inflation is adjusted... */ - txg_wait_open(ds->ds_dir->dd_pool, 0); - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, - ds, &psa, 0); - - dsl_dataset_rele(ds, FTAG); - return (err); -} - -static int -dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value; - uint64_t unique; - int err; - - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < - SPA_VERSION_REFRESERVATION) - return (ENOTSUP); - - if (dsl_dataset_is_snapshot(ds)) - return (EINVAL); - - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); - - effective_value = psa->psa_effective_value; + ASSERT(origin_head->ds_quota == 0 || + dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + + DMU_MAX_ACCESS * spa_asize_inflation); + ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); /* - * If we are doing the preliminary check in open context, the - * space estimates may be inaccurate. + * Swap per-dataset feature flags. 
*/ - if (!dmu_tx_is_syncing(tx)) - return (0); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + ASSERT(!clone->ds_feature_inuse[f]); + ASSERT(!origin_head->ds_feature_inuse[f]); + continue; + } - mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); - mutex_exit(&ds->ds_lock); + boolean_t clone_inuse = clone->ds_feature_inuse[f]; + boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f]; - if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { - uint64_t delta = MAX(unique, effective_value) - - MAX(unique, ds->ds_reserved); - - if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - if (ds->ds_quota > 0 && - effective_value > ds->ds_quota) - return (ENOSPC); + if (clone_inuse) { + dsl_dataset_deactivate_feature(clone->ds_object, f, tx); + clone->ds_feature_inuse[f] = B_FALSE; + } + if (origin_head_inuse) { + dsl_dataset_deactivate_feature(origin_head->ds_object, + f, tx); + origin_head->ds_feature_inuse[f] = B_FALSE; + } + if (clone_inuse) { + dsl_dataset_activate_feature(origin_head->ds_object, + f, tx); + origin_head->ds_feature_inuse[f] = B_TRUE; + } + if (origin_head_inuse) { + dsl_dataset_activate_feature(clone->ds_object, f, tx); + clone->ds_feature_inuse[f] = B_TRUE; + } } - return (0); -} + dmu_buf_will_dirty(clone->ds_dbuf, tx); + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); -/* ARGSUSED */ -static void -dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, - dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; - uint64_t unique; - int64_t delta; - - dsl_prop_set_sync(ds, psa, cr, tx); - DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - mutex_enter(&ds->ds_dir->dd_lock); - mutex_enter(&ds->ds_lock); - unique = dsl_dataset_unique(ds); - delta = MAX(0, (int64_t)(effective_value - unique)) - - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = effective_value; - mutex_exit(&ds->ds_lock); - - dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); - mutex_exit(&ds->ds_dir->dd_lock); - - spa_history_internal_log(LOG_DS_REFRESERV, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", - (longlong_t)effective_value, ds->ds_object); -} - -int -dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, - uint64_t reservation) -{ - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "refreservation", source, - &reservation); - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_reservation_check, - dsl_dataset_set_reservation_sync, ds, &psa, 0); - - dsl_dataset_rele(ds, FTAG); - return (err); -} - -struct dsl_ds_holdarg { - dsl_sync_task_group_t *dstg; - char *htag; - char *snapname; - boolean_t recursive; - boolean_t gotone; - boolean_t temphold; - char failed[MAXPATHLEN]; -}; + if (clone->ds_objset != NULL) { + dmu_objset_evict(clone->ds_objset); + clone->ds_objset = NULL; + } -/* - * The max length of a temporary tag prefix is the number of hex digits - * required to express UINT64_MAX plus one for the hyphen. 
- */ -#define MAX_TAG_PREFIX_LEN 17 + if (origin_head->ds_objset != NULL) { + dmu_objset_evict(origin_head->ds_objset); + origin_head->ds_objset = NULL; + } -static int -dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct dsl_ds_holdarg *ha = arg2; - char *htag = ha->htag; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - int error = 0; + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(origin_head)->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(clone)->ds_unique_bytes); - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); + /* + * Reset origin's unique bytes, if it exists. + */ + if (clone->ds_prev) { + dsl_dataset_t *origin = clone->ds_prev; + uint64_t comp, uncomp; - if (!dsl_dataset_is_snapshot(ds)) - return (EINVAL); + dmu_buf_will_dirty(origin->ds_dbuf, tx); + dsl_deadlist_space_range(&clone->ds_deadlist, + dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, + &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); + } - /* tags must be unique */ - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_userrefs_obj) { - error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, - 8, 1, tx); - if (error == 0) - error = EEXIST; - else if (error == ENOENT) - error = 0; + /* swap blkptrs */ + { + rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG); + rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG); + blkptr_t tmp; + tmp = dsl_dataset_phys(origin_head)->ds_bp; + dsl_dataset_phys(origin_head)->ds_bp = + dsl_dataset_phys(clone)->ds_bp; + dsl_dataset_phys(clone)->ds_bp = tmp; + rrw_exit(&origin_head->ds_bp_rwlock, FTAG); + rrw_exit(&clone->ds_bp_rwlock, FTAG); } - mutex_exit(&ds->ds_lock); - if (error == 0 && ha->temphold && - strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) - error = E2BIG; + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; - return (error); -} + ASSERT3U(dsl_dir_phys(clone->ds_dir)-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); -static void -dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct dsl_ds_holdarg *ha = arg2; - char *htag = ha->htag; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t now = gethrestime_sec(); - uint64_t zapobj; + dsl_deadlist_space(&clone->ds_deadlist, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space(&origin_head->ds_deadlist, + &odl_used, &odl_comp, &odl_uncomp); + + dused = dsl_dataset_phys(clone)->ds_referenced_bytes + + cdl_used - + (dsl_dataset_phys(origin_head)->ds_referenced_bytes + + odl_used); + dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + + cdl_comp - + (dsl_dataset_phys(origin_head)->ds_compressed_bytes + + odl_comp); + duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + + cdl_uncomp - + (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + + odl_uncomp); + + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_userrefs_obj == 0) { /* - * This is the first user hold for this dataset. Create - * the userrefs zap object. 
+ * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). */ - dmu_buf_will_dirty(ds->ds_dbuf, tx); - zapobj = ds->ds_phys->ds_userrefs_obj = - zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); - } else { - zapobj = ds->ds_phys->ds_userrefs_obj; + dsl_deadlist_space_range(&clone->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space_range(&origin_head->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &odl_used, &odl_comp, &odl_uncomp); + dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, NULL); } - ds->ds_userrefs++; - mutex_exit(&ds->ds_lock); - VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + /* swap ds_*_bytes */ + SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, + dsl_dataset_phys(clone)->ds_referenced_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, + dsl_dataset_phys(clone)->ds_compressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, + dsl_dataset_phys(clone)->ds_uncompressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, + dsl_dataset_phys(clone)->ds_unique_bytes); - if (ha->temphold) { - VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, - htag, &now, tx)); - } + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, + unused_refres_delta, 0, 0, tx); + + /* + * Swap deadlists. + */ + dsl_deadlist_close(&clone->ds_deadlist); + dsl_deadlist_close(&origin_head->ds_deadlist); + SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, + dsl_dataset_phys(clone)->ds_deadlist_obj); + dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(clone)->ds_deadlist_obj); + dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(origin_head)->ds_deadlist_obj); - spa_history_internal_log(LOG_DS_USER_HOLD, - dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag, - (int)ha->temphold, ds->ds_object); + dsl_scan_ds_clone_swapped(origin_head, clone, tx); + + spa_history_log_internal_ds(clone, "clone swap", tx, + "parent=%s", origin_head->ds_dir->dd_myname); } -static int -dsl_dataset_user_hold_one(const char *dsname, void *arg) +/* + * Given a pool name and a dataset object number in that pool, + * return the name of that dataset. 
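The sync path above swaps the two datasets' on-disk byte counters and deadlist objects with SWITCH64(); the local definition removed earlier in this hunk is presumably supplied by a shared header in the imported sources. The swap itself is just an exchange of uint64_t lvalues:

#include <stdio.h>
#include <stdint.h>

/* Exchange two uint64_t lvalues, as SWITCH64() does above. */
#define	SWITCH64(x, y)				\
	{					\
		uint64_t __tmp = (x);		\
		(x) = (y);			\
		(y) = __tmp;			\
	}

int
main(void)
{
	uint64_t head_referenced = 100, clone_referenced = 900;

	SWITCH64(head_referenced, clone_referenced);
	printf("head=%llu clone=%llu\n",
	    (unsigned long long)head_referenced,
	    (unsigned long long)clone_referenced);	/* head=900 clone=100 */
	return (0);
}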
+ */ +int +dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { - struct dsl_ds_holdarg *ha = arg; + dsl_pool_t *dp; dsl_dataset_t *ds; int error; - char *name; - /* alloc a buffer to hold dsname@snapname plus terminating NULL */ - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = dsl_dataset_hold(name, ha->dstg, &ds); - strfree(name); + error = dsl_pool_hold(pname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); if (error == 0) { - ha->gotone = B_TRUE; - dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, - dsl_dataset_user_hold_sync, ds, ha, 0); - } else if (error == ENOENT && ha->recursive) { - error = 0; - } else { - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); } + dsl_pool_rele(dp, FTAG); + return (error); } int -dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, - boolean_t recursive, boolean_t temphold) +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) { - struct dsl_ds_holdarg *ha; - dsl_sync_task_t *dst; - spa_t *spa; - int error; + int error = 0; - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + ASSERT3S(asize, >, 0); - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; - error = spa_open(dsname, &spa, FTAG); - if (error) { - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - return (error); + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. + */ + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); + *used -= + (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); } - ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - ha->htag = htag; - ha->snapname = snapname; - ha->recursive = recursive; - ha->temphold = temphold; - if (recursive) { - error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, - ha, DS_FIND_CHILDREN); - } else { - error = dsl_dataset_user_hold_one(dsname, ha); + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); } - if (error == 0) - error = dsl_sync_task_group_wait(ha->dstg); - - for (dst = list_head(&ha->dstg->dstg_tasks); dst; - dst = list_next(&ha->dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; - - if (dst->dst_err) { - dsl_dataset_name(ds, ha->failed); - *strchr(ha->failed, '@') = '\0'; - } - dsl_dataset_rele(ds, ha->dstg); + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). 
+ */ + if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= + ds->ds_quota) { + if (inflight > 0 || + dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) + error = SET_ERROR(ERESTART); + else + error = SET_ERROR(EDQUOT); } + mutex_exit(&ds->ds_lock); - if (error == 0 && recursive && !ha->gotone) - error = ENOENT; - - if (error) - (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); - - dsl_sync_task_group_destroy(ha->dstg); - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - spa_close(spa, FTAG); return (error); } -struct dsl_ds_releasearg { - dsl_dataset_t *ds; - const char *htag; - boolean_t own; /* do we own or just hold ds? */ -}; +typedef struct dsl_dataset_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dataset_set_qr_arg_t; + +/* ARGSUSED */ static int -dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, - boolean_t *might_destroy) +dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj; - uint64_t tmp; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; int error; + uint64_t newval; - *might_destroy = B_FALSE; + if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) + return (SET_ERROR(ENOTSUP)); - mutex_enter(&ds->ds_lock); - zapobj = ds->ds_phys->ds_userrefs_obj; - if (zapobj == 0) { - /* The tag can't possibly exist */ - mutex_exit(&ds->ds_lock); - return (ESRCH); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); } - /* Make sure the tag exists */ - error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); - if (error) { - mutex_exit(&ds->ds_lock); - if (error == ENOENT) - error = ESRCH; + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); return (error); } - if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)) - *might_destroy = B_TRUE; + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || + newval < ds->ds_reserved) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } - mutex_exit(&ds->ds_lock); + dsl_dataset_rele(ds, FTAG); return (0); } -static int -dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) +static void +dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) { - struct dsl_ds_releasearg *ra = arg1; - dsl_dataset_t *ds = ra->ds; - boolean_t might_destroy; - int error; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) - return (ENOTSUP); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); - if (error) - return (error); + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - if (might_destroy) { - struct dsl_ds_destroyarg dsda = {0}; + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); - if (dmu_tx_is_syncing(tx)) { - /* - * If we're not prepared to remove the snapshot, - * we can't allow the release to 
happen right now. - */ - if (!ra->own) - return (EBUSY); - if (ds->ds_objset) { - dmu_objset_evict(ds->ds_objset); - ds->ds_objset = NULL; - } - } - dsda.ds = ds; - dsda.releasing = B_TRUE; - return (dsl_dataset_destroy_check(&dsda, tag, tx)); + if (ds->ds_quota != newval) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = newval; } - - return (0); + dsl_dataset_rele(ds, FTAG); } -static void -dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +int +dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, + uint64_t refquota) { - struct dsl_ds_releasearg *ra = arg1; - dsl_dataset_t *ds = ra->ds; - dsl_pool_t *dp = ds->ds_dir->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj; - uint64_t dsobj = ds->ds_object; - uint64_t refs; - int error; + dsl_dataset_set_qr_arg_t ddsqra; - mutex_enter(&ds->ds_lock); - ds->ds_userrefs--; - refs = ds->ds_userrefs; - mutex_exit(&ds->ds_lock); - error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); - VERIFY(error == 0 || error == ENOENT); - zapobj = ds->ds_phys->ds_userrefs_obj; - VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); - if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)) { - struct dsl_ds_destroyarg dsda = {0}; - - ASSERT(ra->own); - dsda.ds = ds; - dsda.releasing = B_TRUE; - /* We already did the destroy_check */ - dsl_dataset_destroy_sync(&dsda, tag, cr, tx); - } - - spa_history_internal_log(LOG_DS_USER_RELEASE, - dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu", - ra->htag, (longlong_t)refs, dsobj); + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refquota; + + return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, + dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } static int -dsl_dataset_user_release_one(const char *dsname, void *arg) +dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) { - struct dsl_ds_holdarg *ha = arg; - struct dsl_ds_releasearg *ra; + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; - void *dtag = ha->dstg; - char *name; - boolean_t own = B_FALSE; - boolean_t might_destroy; - - /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ - name = kmem_asprintf("%s@%s", dsname, ha->snapname); - error = dsl_dataset_hold(name, dtag, &ds); - strfree(name); - if (error == ENOENT && ha->recursive) - return (0); - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); - if (error) - return (error); + uint64_t newval, unique; - ha->gotone = B_TRUE; + if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) + return (SET_ERROR(ENOTSUP)); - ASSERT(dsl_dataset_is_snapshot(ds)); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } - error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); - if (error) { - dsl_dataset_rele(ds, dtag); + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); return (error); } - if (might_destroy) { -#ifdef _KERNEL - error = zfs_unmount_snap(name, NULL); - if (error) { - dsl_dataset_rele(ds, dtag); - return (error); - } -#endif - if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { - dsl_dataset_rele(ds, dtag); - return (EBUSY); - } else { - own = B_TRUE; - 
dsl_dataset_make_exclusive(ds, dtag); - } + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); + return (0); } - ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); - ra->ds = ds; - ra->htag = ha->htag; - ra->own = own; - dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, ra, dtag, 0); + mutex_enter(&ds->ds_lock); + if (!DS_UNIQUE_IS_ACCURATE(ds)) + dsl_dataset_recalc_head_uniq(ds); + unique = dsl_dataset_phys(ds)->ds_unique_bytes; + mutex_exit(&ds->ds_lock); + + if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, newval) - + MAX(unique, ds->ds_reserved); + + if (delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || + (ds->ds_quota > 0 && newval > ds->ds_quota)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } + } + dsl_dataset_rele(ds, FTAG); return (0); } +void +dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx) +{ + uint64_t newval; + uint64_t unique; + int64_t delta; + + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + source, sizeof (value), 1, &value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + unique = dsl_dataset_phys(ds)->ds_unique_bytes; + delta = MAX(0, (int64_t)(newval - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = newval; + mutex_exit(&ds->ds_lock); + + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); +} + +static void +dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + dsl_dataset_set_refreservation_sync_impl(ds, + ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); + dsl_dataset_rele(ds, FTAG); +} + int -dsl_dataset_user_release(char *dsname, char *snapname, char *htag, - boolean_t recursive) +dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, + uint64_t refreservation) { - struct dsl_ds_holdarg *ha; - dsl_sync_task_t *dst; - spa_t *spa; - int error; + dsl_dataset_set_qr_arg_t ddsqra; -top: - ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refreservation; + + return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, + dsl_dataset_set_refreservation_sync, &ddsqra, + 0, ZFS_SPACE_CHECK_NONE)); +} - (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); +/* + * Return (in *usedp) the amount of space written in new that is not + * present in oldsnap. New may be a snapshot or the head. Old must be + * a snapshot before new, in new's filesystem (or its origin). If not then + * fail and return EINVAL. + * + * The written space is calculated by considering two components: First, we + * ignore any freed space, and calculate the written as new's used space + * minus old's used space. Next, we add in the amount of space that was freed + * between the two snapshots, thus reducing new's used space relative to old's. 
+ * Specifically, this is the space that was born before old->ds_creation_txg, + * and freed before new (ie. on new's deadlist or a previous deadlist). + * + * space freed [---------------------] + * snapshots ---O-------O--------O-------O------ + * oldsnap new + */ +int +dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = new->ds_dir->dd_pool; - error = spa_open(dsname, &spa, FTAG); - if (error) { - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - return (error); - } + ASSERT(dsl_pool_config_held(dp)); - ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); - ha->htag = htag; - ha->snapname = snapname; - ha->recursive = recursive; - if (recursive) { - error = dmu_objset_find(dsname, dsl_dataset_user_release_one, - ha, DS_FIND_CHILDREN); - } else { - error = dsl_dataset_user_release_one(dsname, ha); - } - if (error == 0) - error = dsl_sync_task_group_wait(ha->dstg); + *usedp = 0; + *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; + *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; + + *compp = 0; + *compp += dsl_dataset_phys(new)->ds_compressed_bytes; + *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; + + *uncompp = 0; + *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; + *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; + + snapobj = new->ds_object; + while (snapobj != oldsnap->ds_object) { + dsl_dataset_t *snap; + uint64_t used, comp, uncomp; - for (dst = list_head(&ha->dstg->dstg_tasks); dst; - dst = list_next(&ha->dstg->dstg_tasks, dst)) { - struct dsl_ds_releasearg *ra = dst->dst_arg1; - dsl_dataset_t *ds = ra->ds; + if (snapobj == new->ds_object) { + snap = new; + } else { + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } - if (dst->dst_err) - dsl_dataset_name(ds, ha->failed); + if (dsl_dataset_phys(snap)->ds_prev_snap_txg == + dsl_dataset_phys(oldsnap)->ds_creation_txg) { + /* + * The blocks in the deadlist can not be born after + * ds_prev_snap_txg, so get the whole deadlist space, + * which is more efficient (especially for old-format + * deadlists). Unfortunately the deadlist code + * doesn't have enough information to make this + * optimization itself. + */ + dsl_deadlist_space(&snap->ds_deadlist, + &used, &comp, &uncomp); + } else { + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, + &used, &comp, &uncomp); + } + *usedp += used; + *compp += comp; + *uncompp += uncomp; - if (ra->own) - dsl_dataset_disown(ds, ha->dstg); - else - dsl_dataset_rele(ds, ha->dstg); + /* + * If we get to the beginning of the chain of snapshots + * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap + * was not a snapshot of/before new. + */ + snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + if (snap != new) + dsl_dataset_rele(snap, FTAG); + if (snapobj == 0) { + err = SET_ERROR(EINVAL); + break; + } - kmem_free(ra, sizeof (struct dsl_ds_releasearg)); } + return (err); +} - if (error == 0 && recursive && !ha->gotone) - error = ENOENT; - - if (error && error != EBUSY) - (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); +/* + * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, + * lastsnap, and all snapshots in between are deleted. 
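A compact arithmetic model of dsl_dataset_space_written() above: the written space is new's referenced bytes minus oldsnap's, plus the space on each intervening deadlist that was born no later than oldsnap's creation txg (the compressed/uncompressed counters, which the real function tracks in parallel, are omitted). Values and names are illustrative only.

#include <stdio.h>
#include <stdint.h>

/*
 * One snapshot between oldsnap and new; 'freed_born_before_old' is what
 * dsl_deadlist_space_range(0, old_creation_txg) would report for it.
 */
struct snap_model {
	uint64_t freed_born_before_old;
};

static uint64_t
space_written(uint64_t old_referenced, uint64_t new_referenced,
    const struct snap_model *chain, int nchain)
{
	uint64_t written = new_referenced - old_referenced;

	/* add back space that existed in oldsnap but was freed since */
	for (int i = 0; i < nchain; i++)
		written += chain[i].freed_born_before_old;
	return (written);
}

int
main(void)
{
	/* two intervening snapshots freed 5 and 3 units of old data */
	struct snap_model chain[] = { { 5 }, { 3 } };

	printf("written = %llu\n",
	    (unsigned long long)space_written(100, 120, chain, 2)); /* 28 */
	return (0);
}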
+ * + * blocks that would be freed [---------------------------] + * snapshots ---O-------O--------O-------O--------O + * firstsnap lastsnap + * + * This is the set of blocks that were born after the snap before firstsnap, + * (birth > firstsnap->prev_snap_txg) and died before the snap after the + * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). + * We calculate this by iterating over the relevant deadlists (from the snap + * after lastsnap, backward to the snap after firstsnap), summing up the + * space on the deadlist that was born after the snap before firstsnap. + */ +int +dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, + dsl_dataset_t *lastsnap, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; - dsl_sync_task_group_destroy(ha->dstg); - kmem_free(ha, sizeof (struct dsl_ds_holdarg)); - spa_close(spa, FTAG); + ASSERT(firstsnap->ds_is_snapshot); + ASSERT(lastsnap->ds_is_snapshot); /* - * We can get EBUSY if we were racing with deferred destroy and - * dsl_dataset_user_release_check() hadn't done the necessary - * open context setup. We can also get EBUSY if we're racing - * with destroy and that thread is the ds_owner. Either way - * the busy condition should be transient, and we should retry - * the release operation. + * Check that the snapshots are in the same dsl_dir, and firstsnap + * is before lastsnap. */ - if (error == EBUSY) - goto top; + if (firstsnap->ds_dir != lastsnap->ds_dir || + dsl_dataset_phys(firstsnap)->ds_creation_txg > + dsl_dataset_phys(lastsnap)->ds_creation_txg) + return (SET_ERROR(EINVAL)); - return (error); + *usedp = *compp = *uncompp = 0; + + snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; + while (snapobj != firstsnap->ds_object) { + dsl_dataset_t *ds; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); + if (err != 0) + break; + + dsl_deadlist_space_range(&ds->ds_deadlist, + dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + ASSERT3U(snapobj, !=, 0); + dsl_dataset_rele(ds, FTAG); + } + return (err); } /* - * Called at spa_load time to release a stale temporary user hold. + * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. + * For example, they could both be snapshots of the same filesystem, and + * 'earlier' is before 'later'. Or 'earlier' could be the origin of + * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's + * filesystem. Or 'earlier' could be the origin's origin. + * + * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. 
*/ -int -dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) +boolean_t +dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, + uint64_t earlier_txg) { - dsl_dataset_t *ds; - char *snap; - char *name; - int namelen; + dsl_pool_t *dp = later->ds_dir->dd_pool; int error; + boolean_t ret; - rw_enter(&dp->dp_config_rwlock, RW_READER); - error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - rw_exit(&dp->dp_config_rwlock); - if (error) - return (error); - namelen = dsl_dataset_namelen(ds)+1; - name = kmem_alloc(namelen, KM_SLEEP); - dsl_dataset_name(ds, name); - dsl_dataset_rele(ds, FTAG); + ASSERT(dsl_pool_config_held(dp)); + ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); - snap = strchr(name, '@'); - *snap = '\0'; - ++snap; - return (dsl_dataset_user_release(name, snap, htag, B_FALSE)); + if (earlier_txg == 0) + earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; + + if (later->ds_is_snapshot && + earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) + return (B_FALSE); + + if (later->ds_dir == earlier->ds_dir) + return (B_TRUE); + if (!dsl_dir_is_clone(later->ds_dir)) + return (B_FALSE); + + if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) + return (B_TRUE); + dsl_dataset_t *origin; + error = dsl_dataset_hold_obj(dp, + dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); + if (error != 0) + return (B_FALSE); + ret = dsl_dataset_is_before(origin, earlier, earlier_txg); + dsl_dataset_rele(origin, FTAG); + return (ret); } -int -dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) +void +dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) { - dsl_dataset_t *ds; - int err; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); +} - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); +boolean_t +dsl_dataset_is_zapified(dsl_dataset_t *ds) +{ + dmu_object_info_t doi; - VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); - if (ds->ds_phys->ds_userrefs_obj != 0) { - zap_attribute_t *za; - zap_cursor_t zc; - - za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); - for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_userrefs_obj); - zap_cursor_retrieve(&zc, za) == 0; - zap_cursor_advance(&zc)) { - VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, - za->za_first_integer)); - } - zap_cursor_fini(&zc); - kmem_free(za, sizeof (zap_attribute_t)); - } - dsl_dataset_rele(ds, FTAG); - return (0); + dmu_object_info_from_db(ds->ds_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); } -/* - * Note, this fuction is used as the callback for dmu_objset_find(). We - * always return 0 so that we will continue to find and process - * inconsistent datasets, even if we encounter an error trying to - * process one of them. 
- */ -/* ARGSUSED */ -int -dsl_destroy_inconsistent(const char *dsname, void *arg) +boolean_t +dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) { - dsl_dataset_t *ds; - - if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { - if (DS_IS_INCONSISTENT(ds)) - (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); - else - dsl_dataset_disown(ds, FTAG); - } - return (0); + return (dsl_dataset_is_zapified(ds) && + zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deadlist.c 27 Mar 2016 02:52:21 -0000 @@ -0,0 +1,539 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include +#include +#include +#include +#include +#include + +/* + * Deadlist concurrency: + * + * Deadlists can only be modified from the syncing thread. + * + * Except for dsl_deadlist_insert(), it can only be modified with the + * dp_config_rwlock held with RW_WRITER. + * + * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can + * be called concurrently, from open context, with the dl_config_rwlock held + * with RW_READER. + * + * Therefore, we only need to provide locking between dsl_deadlist_insert() and + * the accessors, protecting: + * dl_phys->dl_used,comp,uncomp + * and protecting the dl_tree from being loaded. + * The locking is provided by dl_lock. Note that locking on the bpobj_t + * provides its own locking, and dl_oldfmt is immutable. 
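In the new dsl_deadlist.c, each dl_tree entry is a bpobj keyed by mintxg; dsl_deadlist_insert() (further down) files a freed block under the entry with the greatest mintxg strictly below the block's birth txg, so a block born exactly on a boundary goes to the previous bucket (the avl_find()/AVL_PREV() handling). A stand-alone sketch of that bucket selection over a sorted key array, standing in for the AVL tree:

#include <stdio.h>
#include <stdint.h>

/*
 * Return the index of the bucket that receives a block born at 'birth':
 * the largest mintxg strictly less than birth, or -1 if there is none.
 */
static int
deadlist_bucket(const uint64_t *mintxg, int n, uint64_t birth)
{
	int lo = 0, hi = n - 1, best = -1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		if (mintxg[mid] < birth) {
			best = mid;	/* candidate; try further right */
			lo = mid + 1;
		} else {
			hi = mid - 1;	/* too new; look left */
		}
	}
	return (best);
}

int
main(void)
{
	uint64_t keys[] = { 0, 100, 250, 400 };	/* snapshot boundary txgs */

	printf("birth 180 -> bucket %d\n", deadlist_bucket(keys, 4, 180));
	printf("birth 400 -> bucket %d\n", deadlist_bucket(keys, 4, 400));
	/*
	 * 180 lands in the bucket starting at 100 (index 1); a block born
	 * exactly at boundary 400 goes to the 250 bucket (index 2).
	 */
	return (0);
}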
+ */ + +static int +dsl_deadlist_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_entry_t *dle1 = arg1; + const dsl_deadlist_entry_t *dle2 = arg2; + + if (dle1->dle_mintxg < dle2->dle_mintxg) + return (-1); + else if (dle1->dle_mintxg > dle2->dle_mintxg) + return (+1); + else + return (0); +} + +static void +dsl_deadlist_load_tree(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havetree) + return; + + avl_create(&dl->dl_tree, dsl_deadlist_compare, + sizeof (dsl_deadlist_entry_t), + offsetof(dsl_deadlist_entry_t, dle_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, + za.za_first_integer)); + avl_add(&dl->dl_tree, dle); + } + zap_cursor_fini(&zc); + dl->dl_havetree = B_TRUE; +} + +void +dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + + mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); + dl->dl_os = os; + dl->dl_object = object; + VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + dmu_object_info_from_db(dl->dl_dbuf, &doi); + if (doi.doi_type == DMU_OT_BPOBJ) { + dmu_buf_rele(dl->dl_dbuf, dl); + dl->dl_dbuf = NULL; + dl->dl_oldfmt = B_TRUE; + VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); + return; + } + + dl->dl_oldfmt = B_FALSE; + dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_havetree = B_FALSE; +} + +void +dsl_deadlist_close(dsl_deadlist_t *dl) +{ + void *cookie = NULL; + dsl_deadlist_entry_t *dle; + + dl->dl_os = NULL; + + if (dl->dl_oldfmt) { + dl->dl_oldfmt = B_FALSE; + bpobj_close(&dl->dl_bpobj); + return; + } + + if (dl->dl_havetree) { + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) + != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + } + avl_destroy(&dl->dl_tree); + } + dmu_buf_rele(dl->dl_dbuf, dl); + mutex_destroy(&dl->dl_lock); + dl->dl_dbuf = NULL; + dl->dl_phys = NULL; +} + +uint64_t +dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) +{ + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); + return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, + sizeof (dsl_deadlist_phys_t), tx)); +} + +void +dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) +{ + dmu_object_info_t doi; + zap_cursor_t zc; + zap_attribute_t za; + + VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_free(os, dlobj, tx); + return; + } + + for (zap_cursor_init(&zc, os, dlobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t obj = za.za_first_integer; + if (obj == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, obj, tx); + } + zap_cursor_fini(&zc); + VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); +} + +static void +dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + const blkptr_t *bp, dmu_tx_t *tx) +{ + if (dle->dle_bpobj.bpo_object == + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } + 
bpobj_enqueue(&dle->dle_bpobj, bp, tx); +} + +static void +dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj, dmu_tx_t *tx) +{ + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); + } else { + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } +} + +void +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + if (dl->dl_oldfmt) { + bpobj_enqueue(&dl->dl_bpobj, bp, tx); + return; + } + + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + mutex_enter(&dl->dl_lock); + dl->dl_phys->dl_used += + bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); + mutex_exit(&dl->dl_lock); + + dle_tofind.dle_mintxg = bp->blk_birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + else + dle = AVL_PREV(&dl->dl_tree, dle); + dle_enqueue(dl, dle, bp, tx); +} + +/* + * Insert new key in deadlist, which must be > all current entries. + * mintxg is not inclusive. + */ +void +dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t obj; + dsl_deadlist_entry_t *dle; + + if (dl->dl_oldfmt) + return; + + dsl_deadlist_load_tree(dl); + + dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = mintxg; + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + avl_add(&dl->dl_tree, dle); + + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, + mintxg, obj, tx)); +} + +/* + * Remove this key, merging its entries into the previous key. + */ +void +dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle, *dle_prev; + + if (dl->dl_oldfmt) + return; + + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + dle_prev = AVL_PREV(&dl->dl_tree, dle); + + dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); + + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); +} + +/* + * Walk ds's snapshots to regenerate generate ZAP & AVL. 
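+ * Called from dsl_deadlist_clone() when the source is an old-format
+ * deadlist: starting at mrs_obj (the most recent snapshot), each
+ * ds_prev_snap_txg becomes a key in the new list and the walk follows
+ * ds_prev_snap_obj back toward the oldest snapshot.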
+ */ +static void +dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_t dl; + dsl_pool_t *dp = dmu_objset_pool(os); + + dsl_deadlist_open(&dl, os, dlobj); + if (dl.dl_oldfmt) { + dsl_deadlist_close(&dl); + return; + } + + while (mrs_obj != 0) { + dsl_dataset_t *ds; + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); + dsl_deadlist_add_key(&dl, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + dsl_deadlist_close(&dl); +} + +uint64_t +dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t *dle; + uint64_t newobj; + + newobj = dsl_deadlist_alloc(dl->dl_os, tx); + + if (dl->dl_oldfmt) { + dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); + return (newobj); + } + + dsl_deadlist_load_tree(dl); + + for (dle = avl_first(&dl->dl_tree); dle; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t obj; + + if (dle->dle_mintxg >= maxtxg) + break; + + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, + dle->dle_mintxg, obj, tx)); + } + return (newobj); +} + +void +dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, + usedp, compp, uncompp)); + return; + } + + mutex_enter(&dl->dl_lock); + *usedp = dl->dl_phys->dl_used; + *compp = dl->dl_phys->dl_comp; + *uncompp = dl->dl_phys->dl_uncomp; + mutex_exit(&dl->dl_lock); +} + +/* + * return space used in the range (mintxg, maxtxg]. + * Includes maxtxg, does not include mintxg. + * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is + * larger than any bp in the deadlist (eg. UINT64_MAX)). + */ +void +dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + dsl_deadlist_entry_t *dle; + dsl_deadlist_entry_t dle_tofind; + avl_index_t where; + + if (dl->dl_oldfmt) { + VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, + mintxg, maxtxg, usedp, compp, uncompp)); + return; + } + + *usedp = *compp = *uncompp = 0; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + /* + * If we don't find this mintxg, there shouldn't be anything + * after it either. 
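+ * (This relies on the requirement above that mintxg is an existing
+ * key; the ASSERT only tolerates a miss when nothing follows it.)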
+ */ + ASSERT(dle != NULL || + avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); + + for (; dle && dle->dle_mintxg < maxtxg; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + uint64_t used, comp, uncomp; + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + + *usedp += used; + *compp += comp; + *uncompp += uncomp; + } + mutex_exit(&dl->dl_lock); +} + +static void +dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + uint64_t used, comp, uncomp; + bpobj_t bpo; + + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); + bpobj_close(&bpo); + + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + mutex_enter(&dl->dl_lock); + dl->dl_phys->dl_used += used; + dl->dl_phys->dl_comp += comp; + dl->dl_phys->dl_uncomp += uncomp; + mutex_exit(&dl->dl_lock); + + dle_tofind.dle_mintxg = birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + dle_enqueue_subobj(dl, dle, obj, tx); +} + +static int +dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, tx); + return (0); +} + +/* + * Merge the deadlist pointed to by 'obj' into dl. obj will be left as + * an empty deadlist. + */ +void +dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + dmu_buf_t *bonus; + dsl_deadlist_phys_t *dlp; + dmu_object_info_t doi; + + VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_t bpo; + VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); + VERIFY3U(0, ==, bpobj_iterate(&bpo, + dsl_deadlist_insert_cb, dl, tx)); + bpobj_close(&bpo); + return; + } + + for (zap_cursor_init(&zc, dl->dl_os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t mintxg = strtonum(za.za_name, NULL); + dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); + } + zap_cursor_fini(&zc); + + VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); + dlp = bonus->db_data; + dmu_buf_will_dirty(bonus, tx); + bzero(dlp, sizeof (*dlp)); + dmu_buf_rele(bonus, FTAG); +} + +/* + * Remove entries on dl that are >= mintxg, and put them on the bpobj. 
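+ * dsl_destroy_snapshot_sync_impl() uses this to push a destroyed
+ * snapshot's no-longer-referenced blocks onto the pool's free bpobj.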
+ */ +void +dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + ASSERT(!dl->dl_oldfmt); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); + while (dle) { + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t *dle_next; + + bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); + + VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, + &used, &comp, &uncomp)); + mutex_enter(&dl->dl_lock); + ASSERT3U(dl->dl_phys->dl_used, >=, used); + ASSERT3U(dl->dl_phys->dl_comp, >=, comp); + ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + mutex_exit(&dl->dl_lock); + + VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, + dle->dle_mintxg, tx)); + + dle_next = AVL_NEXT(&dl->dl_tree, dle); + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + dle = dle_next; + } +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c,v retrieving revision 1.3 diff -u -p -r1.3 dsl_deleg.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c 27 Feb 2010 23:43:53 -0000 1.3 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c 10 Oct 2016 11:09:56 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
*/ /* @@ -107,7 +107,7 @@ dsl_deleg_can_allow(char *ddname, nvlist const char *perm = nvpair_name(permpair); if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) - return (EPERM); + return (SET_ERROR(EPERM)); if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) return (error); @@ -139,42 +139,49 @@ dsl_deleg_can_unallow(char *ddname, nvli if (type != ZFS_DELEG_USER && type != ZFS_DELEG_USER_SETS) - return (EPERM); + return (SET_ERROR(EPERM)); if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) - return (EPERM); + return (SET_ERROR(EPERM)); } return (0); } +typedef struct dsl_deleg_arg { + const char *dda_name; + nvlist_t *dda_nvlist; +} dsl_deleg_arg_t; + static void -dsl_deleg_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - nvlist_t *nvp = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj; + + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } - while (whopair = nvlist_next_nvpair(nvp, whopair)) { + while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; uint64_t jumpobj; - VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); + perms = fnvpair_value_nvlist(whopair); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { - jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, - DMU_OT_NONE, 0, tx); - VERIFY(zap_update(mos, zapobj, - whokey, 8, 1, &jumpobj, tx) == 0); + jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, + zapobj, whokey, tx); } while (permpair = nvlist_next_nvpair(perms, permpair)) { @@ -183,27 +190,31 @@ dsl_deleg_set_sync(void *arg1, void *arg VERIFY(zap_update(mos, jumpobj, perm, 8, 1, &n, tx) == 0); - spa_history_internal_log(LOG_DS_PERM_UPDATE, - dd->dd_pool->dp_spa, tx, cr, - "%s %s dataset = %llu", whokey, perm, - dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal_dd(dd, "permission update", tx, + "%s %s", whokey, perm); } } + dsl_dir_rele(dd, FTAG); } static void -dsl_deleg_unset_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - nvlist_t *nvp = arg2; - objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_deleg_arg_t *dda = arg; + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj; - if (zapobj == 0) + VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + if (zapobj == 0) { + dsl_dir_rele(dd, FTAG); return; + } - while (whopair = nvlist_next_nvpair(nvp, whopair)) { + while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; @@ -215,10 +226,8 @@ dsl_deleg_unset_sync(void *arg1, void *a (void) zap_remove(mos, zapobj, whokey, tx); VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_WHO_REMOVE, - dd->dd_pool->dp_spa, tx, cr, - "%s 
dataset = %llu", whokey, - dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal_dd(dd, "permission who remove", + tx, "%s", whokey); continue; } @@ -236,41 +245,44 @@ dsl_deleg_unset_sync(void *arg1, void *a VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } - spa_history_internal_log(LOG_DS_PERM_REMOVE, - dd->dd_pool->dp_spa, tx, cr, - "%s %s dataset = %llu", whokey, perm, - dd->dd_phys->dd_head_dataset_obj); + spa_history_log_internal_dd(dd, "permission remove", tx, + "%s %s", whokey, perm); } } + dsl_dir_rele(dd, FTAG); } -int -dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +static int +dsl_deleg_check(void *arg, dmu_tx_t *tx) { + dsl_deleg_arg_t *dda = arg; dsl_dir_t *dd; int error; - nvpair_t *whopair = NULL; - int blocks_modified = 0; - error = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (error) - return (error); - - if (spa_version(dmu_objset_spa(dd->dd_pool->dp_meta_objset)) < + if (spa_version(dmu_tx_pool(tx)->dp_spa) < SPA_VERSION_DELEGATED_PERMS) { - dsl_dir_close(dd, FTAG); - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } - while (whopair = nvlist_next_nvpair(nvp, whopair)) - blocks_modified++; + error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); + if (error == 0) + dsl_dir_rele(dd, FTAG); + return (error); +} - error = dsl_sync_task_do(dd->dd_pool, NULL, - unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, - dd, nvp, blocks_modified); - dsl_dir_close(dd, FTAG); +int +dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) +{ + dsl_deleg_arg_t dda; - return (error); + /* nvp must already have been verified to be valid */ + + dda.dda_name = ddname; + dda.dda_nvlist = nvp; + + return (dsl_sync_task(ddname, dsl_deleg_check, + unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, + &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED)); } /* @@ -298,34 +310,36 @@ dsl_deleg_get(const char *ddname, nvlist int error; objset_t *mos; - error = dsl_dir_open(ddname, FTAG, &startdd, NULL); - if (error) + error = dsl_pool_hold(ddname, FTAG, &dp); + if (error != 0) return (error); + error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + dp = startdd->dd_pool; mos = dp->dp_meta_objset; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - rw_enter(&dp->dp_config_rwlock, RW_READER); for (dd = startdd; dd != NULL; dd = dd->dd_parent) { zap_cursor_t basezc; zap_attribute_t baseza; nvlist_t *sp_nvp; uint64_t n; - char source[MAXNAMELEN]; + char source[ZFS_MAX_DATASET_NAME_LEN]; - if (dd->dd_phys->dd_deleg_zapobj && - (zap_count(mos, dd->dd_phys->dd_deleg_zapobj, - &n) == 0) && n) { - VERIFY(nvlist_alloc(&sp_nvp, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - } else { + if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 || + zap_count(mos, + dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0) continue; - } + sp_nvp = fnvlist_alloc(); for (zap_cursor_init(&basezc, mos, - dd->dd_phys->dd_deleg_zapobj); + dsl_dir_phys(dd)->dd_deleg_zapobj); zap_cursor_retrieve(&basezc, &baseza) == 0; zap_cursor_advance(&basezc)) { zap_cursor_t zc; @@ -335,29 +349,26 @@ dsl_deleg_get(const char *ddname, nvlist ASSERT(baseza.za_integer_length == 8); ASSERT(baseza.za_num_integers == 1); - VERIFY(nvlist_alloc(&perms_nvp, - NV_UNIQUE_NAME, KM_SLEEP) == 0); + perms_nvp = fnvlist_alloc(); for (zap_cursor_init(&zc, mos, baseza.za_first_integer); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { - VERIFY(nvlist_add_boolean(perms_nvp, - za.za_name) == 0); + fnvlist_add_boolean(perms_nvp, 
za.za_name); } zap_cursor_fini(&zc); - VERIFY(nvlist_add_nvlist(sp_nvp, baseza.za_name, - perms_nvp) == 0); - nvlist_free(perms_nvp); + fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp); + fnvlist_free(perms_nvp); } zap_cursor_fini(&basezc); dsl_dir_name(dd, source); - VERIFY(nvlist_add_nvlist(*nvp, source, sp_nvp) == 0); + fnvlist_add_nvlist(*nvp, source, sp_nvp); nvlist_free(sp_nvp); } - rw_exit(&dp->dp_config_rwlock); - dsl_dir_close(startdd, FTAG); + dsl_dir_rele(startdd, FTAG); + dsl_pool_rele(dp, FTAG); return (0); } @@ -406,7 +417,7 @@ dsl_check_access(objset_t *mos, uint64_t if (error == 0) { error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); if (error == ENOENT) - error = EPERM; + error = SET_ERROR(EPERM); } return (error); } @@ -418,7 +429,7 @@ static int dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, int checkflag, cred_t *cr) { - gid_t *gids; + const gid_t *gids; int ngids; int i; uint64_t id; @@ -451,7 +462,7 @@ dsl_check_user_access(objset_t *mos, uin return (0); } - return (EPERM); + return (SET_ERROR(EPERM)); } /* @@ -529,9 +540,8 @@ dsl_load_user_sets(objset_t *mos, uint64 * Check if user has requested permission. */ int -dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) { - dsl_dataset_t *ds; dsl_dir_t *dd; dsl_pool_t *dp; void *cookie; @@ -541,25 +551,17 @@ dsl_deleg_access(const char *dsname, con avl_tree_t permsets; perm_set_t *setnode; - error = dsl_dataset_hold(dsname, FTAG, &ds); - if (error) - return (error); - dp = ds->ds_dir->dd_pool; mos = dp->dp_meta_objset; - if (dsl_delegation_on(mos) == B_FALSE) { - dsl_dataset_rele(ds, FTAG); - return (ECANCELED); - } + if (dsl_delegation_on(mos) == B_FALSE) + return (SET_ERROR(ECANCELED)); if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < - SPA_VERSION_DELEGATED_PERMS) { - dsl_dataset_rele(ds, FTAG); - return (EPERM); - } + SPA_VERSION_DELEGATED_PERMS) + return (SET_ERROR(EPERM)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* * Snapshots are treated as descendents only, * local permissions do not apply. 
@@ -572,7 +574,7 @@ dsl_deleg_access(const char *dsname, con avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, checkflag = ZFS_DELEG_DESCENDENT) { uint64_t zapobj; @@ -582,7 +584,7 @@ dsl_deleg_access(const char *dsname, con * If not in global zone then make sure * the zoned property is set */ - if (!INGLOBALZONE(curproc)) { + if (!INGLOBALZONE(curthread)) { uint64_t zoned; if (dsl_prop_get_dd(dd, @@ -592,7 +594,7 @@ dsl_deleg_access(const char *dsname, con if (!zoned) break; } - zapobj = dd->dd_phys->dd_deleg_zapobj; + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) continue; @@ -631,10 +633,8 @@ again: if (error == 0) goto success; } - error = EPERM; + error = SET_ERROR(EPERM); success: - rw_exit(&dp->dp_config_rwlock); - dsl_dataset_rele(ds, FTAG); cookie = NULL; while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) @@ -643,6 +643,26 @@ success: return (error); } +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + + error = dsl_pool_hold(dsname, FTAG, &dp); + if (error != 0) + return (error); + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + error = dsl_deleg_access_impl(ds, perm, cr); + dsl_dataset_rele(ds, FTAG); + } + dsl_pool_rele(dp, FTAG); + + return (error); +} + /* * Other routines. */ @@ -653,7 +673,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_ { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t jumpobj, pjumpobj; - uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; zap_cursor_t zc; zap_attribute_t za; char whokey[ZFS_MAX_DELEG_NAME]; @@ -666,7 +686,7 @@ copy_create_perms(dsl_dir_t *dd, uint64_ if (zapobj == 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); - zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos, + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } @@ -704,7 +724,7 @@ dsl_deleg_set_create_perms(dsl_dir_t *sd return; for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { - uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj; + uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (pzapobj == 0) continue; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_destroy.c 10 Oct 2016 11:09:56 -0000 @@ -0,0 +1,992 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2013 by Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct dmu_snapshots_destroy_arg { + nvlist_t *dsda_snaps; + nvlist_t *dsda_successful_snaps; + boolean_t dsda_defer; + nvlist_t *dsda_errlist; +} dmu_snapshots_destroy_arg_t; + +int +dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) +{ + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (dsl_dataset_long_held(ds)) + return (SET_ERROR(EBUSY)); + + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (SET_ERROR(ENOTSUP)); + return (0); + } + + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0) + return (SET_ERROR(EBUSY)); + + /* + * Can't delete a branch point. + */ + if (dsl_dataset_phys(ds)->ds_num_children > 1) + return (SET_ERROR(EEXIST)); + + return (0); +} + +static int +dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dmu_snapshots_destroy_arg_t *dsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int error = 0; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) { + dsl_dataset_t *ds; + + error = dsl_dataset_hold(dp, nvpair_name(pair), + FTAG, &ds); + + /* + * If the snapshot does not exist, silently ignore it + * (it's "already destroyed"). 
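+ * Any other error is recorded against the snapshot's name in
+ * dsda_errlist below, which makes the whole check fail.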
+ */ + if (error == ENOENT) + continue; + + if (error == 0) { + error = dsl_destroy_snapshot_check_impl(ds, + dsda->dsda_defer); + dsl_dataset_rele(ds, FTAG); + } + + if (error == 0) { + fnvlist_add_boolean(dsda->dsda_successful_snaps, + nvpair_name(pair)); + } else { + fnvlist_add_int32(dsda->dsda_errlist, + nvpair_name(pair), error); + } + } + + pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL); + if (pair != NULL) + return (fnvpair_value_int32(pair)); + + return (0); +} + +struct process_old_arg { + dsl_dataset_t *ds; + dsl_dataset_t *ds_prev; + boolean_t after_branch_point; + zio_t *pio; + uint64_t used, comp, uncomp; +}; + +static int +process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct process_old_arg *poa = arg; + dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; + + ASSERT(!BP_IS_HOLE(bp)); + + if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { + dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); + if (poa->ds_prev && !poa->after_branch_point && + bp->blk_birth > + dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { + dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += + bp_get_dsize_sync(dp->dp_spa, bp); + } + } else { + poa->used += bp_get_dsize_sync(dp->dp_spa, bp); + poa->comp += BP_GET_PSIZE(bp); + poa->uncomp += BP_GET_UCSIZE(bp); + dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); + } + return (0); +} + +static void +process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, + dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) +{ + struct process_old_arg poa = { 0 }; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t deadlist_obj; + + ASSERT(ds->ds_deadlist.dl_oldfmt); + ASSERT(ds_next->ds_deadlist.dl_oldfmt); + + poa.ds = ds; + poa.ds_prev = ds_prev; + poa.after_branch_point = after_branch_point; + poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, + process_old_cb, &poa, tx)); + VERIFY0(zio_wait(poa.pio)); + ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -poa.used, -poa.comp, -poa.uncomp, tx); + + /* swap next's deadlist to our deadlist */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_close(&ds_next->ds_deadlist); + deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; + dsl_dataset_phys(ds)->ds_deadlist_obj = + dsl_dataset_phys(ds_next)->ds_deadlist_obj; + dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; + dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj); + dsl_deadlist_open(&ds_next->ds_deadlist, mos, + dsl_dataset_phys(ds_next)->ds_deadlist_obj); +} + +static void +dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but dsl_deadlist_remove_key() is a no-op so it + * doesn't matter. 
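+ * (dd_clones is only maintained on SPA_VERSION_DIR_CLONES pools, and
+ * dsl_deadlist_remove_key() returns immediately for old-format lists.)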
+ */ + if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0) + return; + + for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + za.za_first_integer, FTAG, &clone)); + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + dsl_dataset_remove_clones_key(clone, mintxg, tx); + } + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(&zc); +} + +void +dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) +{ + int err; + int after_branch_point = FALSE; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds_prev = NULL; + uint64_t obj; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + ASSERT(refcount_is_zero(&ds->ds_longholds)); + + if (defer && + (ds->ds_userrefs > 0 || + dsl_dataset_phys(ds)->ds_num_children > 1)) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; + spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); + return; + } + + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + dsl_scan_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) { + dsl_dataset_deactivate_feature(obj, f, tx); + ds->ds_feature_inuse[f] = B_FALSE; + } + } + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + ASSERT3P(ds->ds_prev, ==, NULL); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); + after_branch_point = + (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { + VERIFY0(zap_add_int(mos, + dsl_dataset_phys(ds_prev)-> + ds_next_clones_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + tx)); + } + } + if (!after_branch_point) { + dsl_dataset_phys(ds_prev)->ds_next_snap_obj = + dsl_dataset_phys(ds)->ds_next_snap_obj; + } + } + + dsl_dataset_t *ds_next; + uint64_t old_unique; + uint64_t used = 0, comp = 0, uncomp = 0; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next)); + ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj); + + old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + dsl_dataset_phys(ds_next)->ds_prev_snap_obj = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_phys(ds_next)->ds_prev_snap_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0); + + if (ds_next->ds_deadlist.dl_oldfmt) { + process_old_deadlist(ds, ds_prev, ds_next, + after_branch_point, tx); + } else { + /* Adjust prev's unique space. 
*/ + if (ds_prev && !after_branch_point) { + dsl_deadlist_space_range(&ds_next->ds_deadlist, + dsl_dataset_phys(ds_prev)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + &used, &comp, &uncomp); + dsl_dataset_phys(ds_prev)->ds_unique_bytes += used; + } + + /* Adjust snapused. */ + dsl_deadlist_space_range(&ds_next->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -comp, -uncomp, tx); + + /* Move blocks to be freed to pool's free list. */ + dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, + &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg, + tx); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, comp, uncomp, tx); + + /* Merge our deadlist into next's and free it. */ + dsl_deadlist_merge(&ds_next->ds_deadlist, + dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + } + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + + /* Collapse range in clone heads */ + dsl_dataset_remove_clones_key(ds, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + + if (ds_next->ds_is_snapshot) { + dsl_dataset_t *ds_nextnext; + + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). + */ + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds_next)->ds_next_snap_obj, + FTAG, &ds_nextnext)); + dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_creation_txg, + &used, &comp, &uncomp); + dsl_dataset_phys(ds_next)->ds_unique_bytes += used; + dsl_dataset_rele(ds_nextnext, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + + /* Collapse range in this head. */ + dsl_dataset_t *hds; + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); + dsl_deadlist_remove_key(&hds->ds_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + dsl_dataset_rele(hds, FTAG); + + } else { + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_rele(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; + if (ds_prev) { + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); + } + + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. + */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + dsl_dataset_phys(ds_next)->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. 
+ */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* remove from snapshot namespace */ + dsl_dataset_t *ds_head; + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); + VERIFY0(dsl_dataset_get_snapname(ds)); +#ifdef ZFS_DEBUG + { + uint64_t val; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); + ASSERT0(err); + ASSERT3U(val, ==, obj); + } +#endif + VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); + dsl_dataset_rele(ds_head, FTAG); + + if (ds_prev != NULL) + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { + uint64_t count; + ASSERT0(zap_count(mos, + dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && + count == 0); + VERIFY0(dmu_object_free(mos, + dsl_dataset_phys(ds)->ds_next_clones_obj, tx)); + } + if (dsl_dataset_phys(ds)->ds_props_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj, + tx)); + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + tx)); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + dmu_object_free_zapified(mos, obj, tx); +} + +static void +dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dmu_snapshots_destroy_arg_t *dsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + + dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx); + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The semantics of this function are described in the comment above + * lzc_destroy_snaps(). To summarize: + * + * The snapshots must all be in the same pool. + * + * Snapshots that don't exist will be silently ignored (considered to be + * "already deleted"). + * + * On success, all snaps will be destroyed and this will return 0. + * On failure, no snaps will be destroyed, the errlist will be filled in, + * and this will return an errno. 
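+ *
+ * The check/sync pair runs as a single dsl_sync_task, so every snapshot
+ * in the batch is destroyed in the same txg.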
+ */ +int +dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, + nvlist_t *errlist) +{ + dmu_snapshots_destroy_arg_t dsda; + int error; + nvpair_t *pair; + + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + + dsda.dsda_snaps = snaps; + dsda.dsda_successful_snaps = fnvlist_alloc(); + dsda.dsda_defer = defer; + dsda.dsda_errlist = errlist; + + error = dsl_sync_task(nvpair_name(pair), + dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync, + &dsda, 0, ZFS_SPACE_CHECK_NONE); + fnvlist_free(dsda.dsda_successful_snaps); + + return (error); +} + +int +dsl_destroy_snapshot(const char *name, boolean_t defer) +{ + int error; + nvlist_t *nvl = fnvlist_alloc(); + nvlist_t *errlist = fnvlist_alloc(); + + fnvlist_add_boolean(nvl, name); + error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); + fnvlist_free(errlist); + fnvlist_free(nvl); + return (error); +} + +struct killarg { + dsl_dataset_t *ds; + dmu_tx_t *tx; +}; + +/* ARGSUSED */ +static int +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; + + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return (0); + + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); + /* + * It's a block in the intent log. It has no + * accounting, so just free it. + */ + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); + } else { + ASSERT(zilog == NULL); + ASSERT3U(bp->blk_birth, >, + dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); + } + + return (0); +} + +static void +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + struct killarg ka; + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + VERIFY0(traverse_dataset(ds, + dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST, + kill_blkptr, &ka)); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == 0); +} + +typedef struct dsl_destroy_head_arg { + const char *ddha_name; +} dsl_destroy_head_arg_t; + +int +dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) +{ + int error; + uint64_t count; + objset_t *mos; + + ASSERT(!ds->ds_is_snapshot); + if (ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (refcount_count(&ds->ds_longholds) != expected_holds) + return (SET_ERROR(EBUSY)); + + mos = ds->ds_dir->dd_pool->dp_meta_objset; + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) + return (SET_ERROR(EBUSY)); + + /* + * Can't delete if there are children of this fs. + */ + error = zap_count(mos, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); + if (error != 0) + return (error); + if (count != 0) + return (SET_ERROR(EEXIST)); + + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0) { + /* We need to remove the origin snapshot as well. 
*/ + if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) + return (SET_ERROR(EBUSY)); + } + return (0); +} + +static int +dsl_destroy_head_check(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(ds, 0); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) +{ + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dd_used_t t; + + ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); + + VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); + + ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); + + /* + * Decrement the filesystem count for all parent filesystems. + * + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We never count this temporary + * clone, whose name begins with a '%'. + */ + if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, -1, + DD_FIELD_FILESYSTEM_COUNT, tx); + + /* + * Remove our reservation. The impl() routine avoids setting the + * actual property, which would require the (already destroyed) ds. + */ + dsl_dir_set_reservation_sync_impl(dd, 0, tx); + + ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); + ASSERT0(dsl_dir_phys(dd)->dd_reserved); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); + + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); + VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); + VERIFY0(zap_remove(mos, + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, + dd->dd_myname, tx)); + + dsl_dir_rele(dd, FTAG); + dmu_object_free_zapified(mos, ddobj, tx); +} + +void +dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + uint64_t obj, ddobj, prevobj = 0; + boolean_t rmorigin; + + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + ASSERT(ds->ds_prev == NULL || + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, ""); + + rmorigin = (dsl_dir_is_clone(ds->ds_dir) && + DS_IS_DEFER_DESTROY(ds->ds_prev) && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0); + + /* Remove our reservation. 
*/ + if (ds->ds_reserved != 0) { + dsl_dataset_set_refreservation_sync_impl(ds, + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + 0, tx); + ASSERT0(ds->ds_reserved); + } + + obj = ds->ds_object; + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (ds->ds_feature_inuse[f]) { + dsl_dataset_deactivate_feature(obj, f, tx); + ds->ds_feature_inuse[f] = B_FALSE; + } + } + + dsl_scan_ds_destroyed(ds, tx); + + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + /* This is a clone */ + ASSERT(ds->ds_prev != NULL); + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=, + obj); + ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); + + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds->ds_prev, + obj, tx); + } + + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1); + dsl_dataset_phys(ds->ds_prev)->ds_num_children--; + } + + /* + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty. (If it's a clone, it's + * safe to ignore the deadlist contents.) + */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + + objset_t *os; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { + old_synchronous_dataset_destroy(ds, tx); + } else { + /* + * Move the bptree into the pool's list of trees to + * clean up and update space accounting information. + */ + uint64_t used, comp, uncomp; + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_scan_t *scn = dp->dp_scan; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, + tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = B_TRUE; + } + + used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; + comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; + uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == used); + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bptree_add(mos, dp->dp_bptree_obj, + &dsl_dataset_phys(ds)->ds_bp, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + used, comp, uncomp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + } + + if (ds->ds_prev != NULL) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(mos, + dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones, + ds->ds_object, tx)); + } + prevobj = ds->ds_prev->ds_object; + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = NULL; + } + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. 
+ */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* Erase the link in the dir */ + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0; + ddobj = ds->ds_dir->dd_object; + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0); + VERIFY0(zap_destroy(mos, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); + + if (ds->ds_bookmarks != 0) { + VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); + spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + } + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_props_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + dmu_object_free_zapified(mos, obj, tx); + + dsl_dir_destroy_sync(ddobj, tx); + + if (rmorigin) { + dsl_dataset_t *prev; + VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); + dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); + dsl_dataset_rele(prev, FTAG); + } +} + +static void +dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + dsl_destroy_head_sync_impl(ds, tx); + dsl_dataset_rele(ds, FTAG); +} + +static void +dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + + /* Mark it as inconsistent on-disk, in case we crash */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_log_internal_ds(ds, "destroy begin", tx, ""); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_destroy_head(const char *name) +{ + dsl_destroy_head_arg_t ddha; + int error; + spa_t *spa; + boolean_t isenabled; + +#ifdef _KERNEL + zfs_destroy_unmount_origin(name); +#endif + + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY); + spa_close(spa, FTAG); + + ddha.ddha_name = name; + + if (!isenabled) { + objset_t *os; + + error = dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_begin_sync, &ddha, + 0, ZFS_SPACE_CHECK_NONE); + if (error != 0) + return (error); + + /* + * Head deletion is processed in one txg on old pools; + * remove the objects from open context so that the txg sync + * is not too long. + */ + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); + if (error == 0) { + uint64_t prev_snap_txg = + dsl_dataset_phys(dmu_objset_ds(os))-> + ds_prev_snap_txg; + for (uint64_t obj = 0; error == 0; + error = dmu_object_next(os, &obj, FALSE, + prev_snap_txg)) + (void) dmu_free_long_object(os, obj); + /* sync out all frees */ + txg_wait_synced(dmu_objset_pool(os), 0); + dmu_objset_disown(os, FTAG); + } + } + + return (dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE)); +} + +/* + * Note, this function is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them. 
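+ * Datasets are left DS_FLAG_INCONSISTENT by an interrupted receive or by
+ * dsl_destroy_head_begin_sync() above, so this sweeps up after a crash.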
+ */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + objset_t *os; + + if (dmu_objset_hold(dsname, FTAG, &os) == 0) { + boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + + /* + * If the dataset is inconsistent because a resumable receive + * has failed, then do not destroy it. + */ + if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) + need_destroy = B_FALSE; + + dmu_objset_rele(os, FTAG); + if (need_destroy) + (void) dsl_destroy_head(dsname); + } + return (0); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dsl_dir.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c 27 Feb 2010 22:30:56 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c 17 Apr 2017 23:36:24 -0000 @@ -19,8 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ #include @@ -31,27 +36,111 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#ifdef _KERNEL +#include +#endif +#include +#include +#include #include "zfs_namecheck.h" +#include "zfs_prop.h" -static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, - cred_t *cr, dmu_tx_t *tx); +/* + * Filesystem and Snapshot Limits + * ------------------------------ + * + * These limits are used to restrict the number of filesystems and/or snapshots + * that can be created at a given level in the tree or below. A typical + * use-case is with a delegated dataset where the administrator wants to ensure + * that a user within the zone is not creating too many additional filesystems + * or snapshots, even though they're not exceeding their space quota. + * + * The filesystem and snapshot counts are stored as extensible properties. This + * capability is controlled by a feature flag and must be enabled to be used. + * Once enabled, the feature is not active until the first limit is set. At + * that point, future operations to create/destroy filesystems or snapshots + * will validate and update the counts. + * + * Because the count properties will not exist before the feature is active, + * the counts are updated when a limit is first set on an uninitialized + * dsl_dir node in the tree (The filesystem/snapshot count on a node includes + * all of the nested filesystems/snapshots. Thus, a new leaf node has a + * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and + * snapshot count properties on a node indicate uninitialized counts on that + * node.) When first setting a limit on an uninitialized node, the code starts + * at the filesystem with the new limit and descends into all sub-filesystems + * to add the count properties. + * + * In practice this is lightweight since a limit is typically set when the + * filesystem is created and thus has no children. 
Once valid, changing the + * limit value won't require a re-traversal since the counts are already valid. + * When recursively fixing the counts, if a node with a limit is encountered + * during the descent, the counts are known to be valid and there is no need to + * descend into that filesystem's children. The counts on filesystems above the + * one with the new limit will still be uninitialized, unless a limit is + * eventually set on one of those filesystems. The counts are always recursively + * updated when a limit is set on a dataset, unless there is already a limit. + * When a new limit value is set on a filesystem with an existing limit, it is + * possible for the new limit to be less than the current count at that level + * since a user who can change the limit is also allowed to exceed the limit. + * + * Once the feature is active, then whenever a filesystem or snapshot is + * created, the code recurses up the tree, validating the new count against the + * limit at each initialized level. In practice, most levels will not have a + * limit set. If there is a limit at any initialized level up the tree, the + * check must pass or the creation will fail. Likewise, when a filesystem or + * snapshot is destroyed, the counts are recursively adjusted all the way up + * the initizized nodes in the tree. Renaming a filesystem into different point + * in the tree will first validate, then update the counts on each branch up to + * the common ancestor. A receive will also validate the counts and then update + * them. + * + * An exception to the above behavior is that the limit is not enforced if the + * user has permission to modify the limit. This is primarily so that + * recursive snapshots in the global zone always work. We want to prevent a + * denial-of-service in which a lower level delegated dataset could max out its + * limit and thus block recursive snapshots from being taken in the global zone. + * Because of this, it is possible for the snapshot count to be over the limit + * and snapshots taken in the global zone could cause a lower level dataset to + * hit or exceed its limit. The administrator taking the global zone recursive + * snapshot should be aware of this side-effect and behave accordingly. + * For consistency, the filesystem limit is also not enforced if the user can + * modify the limit. + * + * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() + * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in + * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by + * dsl_dir_init_fs_ss_count(). + * + * There is a special case when we receive a filesystem that already exists. In + * this case a temporary clone name of %X is created (see dmu_recv_begin). We + * never update the filesystem counts for temporary clones. + * + * Likewise, we do not update the snapshot counts for temporary snapshots, + * such as those created by zfs diff. 
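+ *
+ * The counts themselves are stored as entries (e.g.
+ * DD_FIELD_FILESYSTEM_COUNT) in the dsl_dir's zapified MOS object, which
+ * is why a node with no such entry is treated as uninitialized.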
+ */ +extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); + +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -/* ARGSUSED */ static void -dsl_dir_evict(dmu_buf_t *db, void *arg) +dsl_dir_evict_async(void *dbu) { - dsl_dir_t *dd = arg; + dsl_dir_t *dd = dbu; dsl_pool_t *dp = dd->dd_pool; int t; + dd->dd_dbuf = NULL; + for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); @@ -59,39 +148,34 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) } if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_async_rele(dd->dd_parent, dd); - spa_close(dd->dd_pool->dp_spa, dd); + spa_async_close(dd->dd_pool->dp_spa, dd); - /* - * The props callback list should be empty since they hold the - * dir open. - */ - list_destroy(&dd->dd_prop_cbs); + dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int -dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, +dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; int err; - ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || - dsl_pool_sync_context(dp)); + ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); - if (err) + if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); - ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif @@ -102,45 +186,67 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_ dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; - dd->dd_phys = dbuf->db_data; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); - - list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), - offsetof(dsl_prop_cb_record_t, cbr_node)); + dsl_prop_init(dd); dsl_dir_snap_cmtime_update(dd); - if (dd->dd_phys->dd_parent_obj) { - err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, - NULL, dd, &dd->dd_parent); - if (err) + if (dsl_dir_phys(dd)->dd_parent_obj) { + err = dsl_dir_hold_obj(dp, + dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, + &dd->dd_parent); + if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, - tail, sizeof (foundobj), 1, &foundobj); + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, tail, + sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } - if (err) + if (err != 0) goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } - winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, - dsl_dir_evict); - if (winner) { + if (dsl_dir_is_clone(dd)) { + dmu_buf_t *origin_bonus; + dsl_dataset_phys_t *origin_phys; + + /* + * We can't open the origin dataset, because + * that would require opening this dsl_dir. + * Just look at its phys directly instead. 
+ */ + err = dmu_bonus_hold(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, + &origin_bonus); + if (err != 0) + goto errout; + origin_phys = origin_bonus->db_data; + dd->dd_origin_txg = + origin_phys->ds_creation_txg; + dmu_buf_rele(origin_bonus, FTAG); + } + + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, + &dd->dd_dbuf); + winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); + if (winner != NULL) { if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); + dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; @@ -167,29 +273,45 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_ errout: if (dd->dd_parent) - dsl_dir_close(dd->dd_parent, dd); + dsl_dir_rele(dd->dd_parent, dd); + dsl_prop_fini(dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); - } void -dsl_dir_close(dsl_dir_t *dd, void *tag) +dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } -/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ +/* + * Remove a reference to the given dsl dir that is being asynchronously + * released. Async releases occur from a taskq performing eviction of + * dsl datasets and dirs. This process is identical to a normal release + * with the exception of using the async API for releasing the reference on + * the spa. + */ +void +dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_async_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + +/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); - (void) strcat(buf, "/"); + VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); } else { buf[0] = '\0'; } @@ -199,14 +321,16 @@ dsl_dir_name(dsl_dir_t *dd, char *buf) * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); - (void) strcat(buf, dd->dd_myname); + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&dd->dd_lock); } else { - (void) strcat(buf, dd->dd_myname); + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); } } -/* Calculate name legnth, avoiding all the strcat calls of dsl_dir_name */ +/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { @@ -233,13 +357,14 @@ static int getcomponent(const char *path, char *component, const char **nextp) { char *p; + if ((path == NULL) || (path[0] == '\0')) - return (ENOENT); + return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... 
*/ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ - return (EINVAL); + return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* @@ -249,16 +374,16 @@ getcomponent(const char *path, char *com */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) - return (EINVAL); - if (strlen(path) >= MAXNAMELEN) - return (ENAMETOOLONG); + return (SET_ERROR(EINVAL)); + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { - if (p-path >= MAXNAMELEN) - return (ENAMETOOLONG); + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); - component[p-path] = '\0'; + component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* @@ -266,95 +391,81 @@ getcomponent(const char *path, char *com * any more slashes. */ if (strchr(path, '/')) - return (EINVAL); - if (p-path >= MAXNAMELEN) - return (ENAMETOOLONG); + return (SET_ERROR(EINVAL)); + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); - component[p-path] = '\0'; + component[p - path] = '\0'; } else { - ASSERT(!"invalid p"); + panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* - * same as dsl_open_dir, ignore the first component of name and use the - * spa instead + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. The name must be in the specified dsl_pool_t. This + * thread must hold the dp_config_rwlock for the pool. Returns NULL if the + * path is bogus, or if tail==NULL and we couldn't parse the whole name. + * (*tail)[0] == '@' means that the last component is a snapshot. */ int -dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, +dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { - char buf[MAXNAMELEN]; - const char *next, *nextnext = NULL; + char buf[ZFS_MAX_DATASET_NAME_LEN]; + const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; - dsl_pool_t *dp; uint64_t ddobj; - int openedspa = FALSE; - - dprintf("%s\n", name); err = getcomponent(name, buf, &next); - if (err) + if (err != 0) return (err); - if (spa == NULL) { - err = spa_open(buf, &spa, FTAG); - if (err) { - dprintf("spa_open(%s) failed\n", buf); - return (err); - } - openedspa = TRUE; - /* XXX this assertion belongs in spa_open */ - ASSERT(!dsl_pool_sync_context(spa_get_dsl(spa))); - } + /* Make sure the name is in the specified pool. 
*/ + spaname = spa_name(dp->dp_spa); + if (strcmp(buf, spaname) != 0) + return (SET_ERROR(EXDEV)); - dp = spa_get_dsl(spa); + ASSERT(dsl_pool_config_held(dp)); - rw_enter(&dp->dp_config_rwlock, RW_READER); - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); - if (err) { - rw_exit(&dp->dp_config_rwlock); - if (openedspa) - spa_close(spa, FTAG); + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err != 0) { return (err); } while (next != NULL) { - dsl_dir_t *child_ds; + dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); - if (err) + if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", - buf, dd->dd_phys->dd_child_dir_zapobj); + buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, - dd->dd_phys->dd_child_dir_zapobj, + dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); - if (err) { + if (err != 0) { if (err == ENOENT) err = 0; break; } - err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); - if (err) + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); + if (err != 0) break; - dsl_dir_close(dd, tag); - dd = child_ds; + dsl_dir_rele(dd, tag); + dd = child_dd; next = nextnext; } - rw_exit(&dp->dp_config_rwlock); - if (err) { - dsl_dir_close(dd, tag); - if (openedspa) - spa_close(spa, FTAG); + if (err != 0) { + dsl_dir_rele(dd, tag); return (err); } @@ -365,28 +476,416 @@ dsl_dir_open_spa(spa_t *spa, const char if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ - dsl_dir_close(dd, tag); + dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); - err = ENOENT; + err = SET_ERROR(ENOENT); } - if (tailp) + if (tailp != NULL) *tailp = next; - if (openedspa) - spa_close(spa, FTAG); *ddp = dd; return (err); } /* - * Return the dsl_dir_t, and possibly the last component which couldn't - * be found in *tail. Return NULL if the path is bogus, or if - * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' - * means that the last component is a snapshot. + * If the counts are already initialized for this filesystem and its + * descendants then do nothing, otherwise initialize the counts. + * + * The counts on this filesystem, and those below, may be uninitialized due to + * either the use of a pre-existing pool which did not support the + * filesystem/snapshot limit feature, or one in which the feature had not yet + * been enabled. + * + * Recursively descend the filesystem tree and update the filesystem/snapshot + * counts on each filesystem below, then update the cumulative count on the + * current filesystem. If the filesystem already has a count set on it, + * then we know that its counts, and the counts on the filesystems below it, + * are already correct, so we don't have to update this filesystem. + */ +static void +dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) +{ + uint64_t my_fs_cnt = 0; + uint64_t my_ss_cnt = 0; + dsl_pool_t *dp = dd->dd_pool; + objset_t *os = dp->dp_meta_objset; + zap_cursor_t *zc; + zap_attribute_t *za; + dsl_dataset_t *ds; + + ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); + ASSERT(dsl_pool_config_held(dp)); + ASSERT(dmu_tx_is_syncing(tx)); + + dsl_dir_zapify(dd, tx); + + /* + * If the filesystem count has already been initialized then we + * don't need to recurse down any further. 
+ */ + if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* Iterate my child dirs */ + for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { + dsl_dir_t *chld_dd; + uint64_t count; + + VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, + &chld_dd)); + + /* + * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and + * temporary datasets. + */ + if (chld_dd->dd_myname[0] == '$' || + chld_dd->dd_myname[0] == '%') { + dsl_dir_rele(chld_dd, FTAG); + continue; + } + + my_fs_cnt++; /* count this child */ + + dsl_dir_init_fs_ss_count(chld_dd, tx); + + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); + my_fs_cnt += count; + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); + my_ss_cnt += count; + + dsl_dir_rele(chld_dd, FTAG); + } + zap_cursor_fini(zc); + /* Count my snapshots (we counted children's snapshots above) */ + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); + + for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + /* Don't count temporary snapshots */ + if (za->za_name[0] != '%') + my_ss_cnt++; + } + zap_cursor_fini(zc); + + dsl_dataset_rele(ds, FTAG); + + kmem_free(zc, sizeof (zap_cursor_t)); + kmem_free(za, sizeof (zap_attribute_t)); + + /* we're in a sync task, update counts */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); +} + +static int +dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + int error; + + error = dsl_dataset_hold(dp, ddname, FTAG, &ds); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + dd = ds->ds_dir; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && + dsl_dir_is_zapified(dd) && + zap_contains(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT) == 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EALREADY)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + spa_t *spa; + + VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); + + spa = dsl_dataset_get_spa(ds); + + if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Since the feature was not active and we're now setting a + * limit, increment the feature-active counter so that the + * feature becomes active for the first time. + * + * We are already in a sync task so we can update the MOS. + */ + spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); + } + + /* + * Since we are now setting a non-UINT64_MAX limit on the filesystem, + * we need to ensure the counts are correct. Descend down the tree from + * this point and update all of the counts to be accurate. 
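The check/sync pair above is driven by dsl_dir_activate_fs_ss_limit(), defined just below. A hypothetical caller on the ioctl path, sketched with assumed surrounding variables (dsname, error), would look roughly like:

	/*
	 * Hypothetical ioctl-path caller: ensure the limit feature is
	 * enabled and active before the new filesystem_limit or
	 * snapshot_limit value is persisted.  EALREADY is mapped to
	 * success inside dsl_dir_activate_fs_ss_limit() itself.
	 */
	error = dsl_dir_activate_fs_ss_limit(dsname);
	if (error != 0)
		return (error);	/* e.g. ENOTSUP: feature not enabled */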
+ */ + dsl_dir_init_fs_ss_count(ds->ds_dir, tx); + + dsl_dataset_rele(ds, FTAG); +} + +/* + * Make sure the feature is enabled and activate it if necessary. + * Since we're setting a limit, ensure the on-disk counts are valid. + * This is only called by the ioctl path when setting a limit value. + * + * We do not need to validate the new limit, since users who can change the + * limit are also allowed to exceed the limit. + */ +int +dsl_dir_activate_fs_ss_limit(const char *ddname) +{ + int error; + + error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, + dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, + ZFS_SPACE_CHECK_RESERVED); + + if (error == EALREADY) + error = 0; + + return (error); +} + +/* + * Used to determine if the filesystem_limit or snapshot_limit should be + * enforced. We allow the limit to be exceeded if the user has permission to + * write the property value. We pass in the creds that we got in the open + * context since we will always be the GZ root in syncing context. We also have + * to handle the case where we are allowed to change the limit on the current + * dataset, but there may be another limit in the tree above. + * + * We can never modify these two properties within a non-global zone. In + * addition, the other checks are modeled on zfs_secpolicy_write_perms. We + * can't use that function since we are already holding the dp_config_rwlock. + * In addition, we already have the dd and dealing with snapshots is simplified + * in this code. + */ + +typedef enum { + ENFORCE_ALWAYS, + ENFORCE_NEVER, + ENFORCE_ABOVE +} enforce_res_t; + +static enforce_res_t +dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) +{ + enforce_res_t enforce = ENFORCE_ALWAYS; + uint64_t obj; + dsl_dataset_t *ds; + uint64_t zoned; + + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + +#ifdef _KERNEL +#ifdef illumos + if (crgetzoneid(cr) != GLOBAL_ZONEID) +#endif +#ifdef __FreeBSD__ + if (jailed(cr)) +#endif +#ifdef __NetBSD__ + if (0) +#endif + return (ENFORCE_ALWAYS); + + if (secpolicy_zfs(cr) == 0) + return (ENFORCE_NEVER); +#endif + + if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) + return (ENFORCE_ALWAYS); + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + + if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) + return (ENFORCE_ALWAYS); + + if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { + /* Only root can access zoned fs's from the GZ */ + enforce = ENFORCE_ALWAYS; + } else { + if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) + enforce = ENFORCE_ABOVE; + } + + dsl_dataset_rele(ds, FTAG); + return (enforce); +} + +/* + * Check if adding additional child filesystem(s) would exceed any filesystem + * limits or adding additional snapshot(s) would exceed any snapshot limits. + * The prop argument indicates which limit to check. + * + * Note that all filesystem limits up to the root (or the highest + * initialized) filesystem or the given ancestor must be satisfied. 
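As a concrete illustration of the intended call pattern, a hypothetical creation path about to add one snapshot under dd could invoke the check as sketched below (dd, cr and error are assumed from the surrounding context; the EDQUOT value comes from the function body that follows):

	/*
	 * Hypothetical caller: may one more snapshot be created under dd?
	 * ancestor is NULL because this is not a rename, so every
	 * initialized limit up to the pool root is validated; cr lets a
	 * user who may modify the limit bypass enforcement.
	 */
	error = dsl_fs_ss_limit_check(dd, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, cr);
	if (error != 0)
		return (error);	/* EDQUOT when a limit would be exceeded */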
*/ int -dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) +dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, + dsl_dir_t *ancestor, cred_t *cr) +{ + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t limit, count; + char *count_prop; + enforce_res_t enforce; + int err = 0; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + + /* + * If we're allowed to change the limit, don't enforce the limit + * e.g. this can happen if a snapshot is taken by an administrative + * user in the global zone (i.e. a recursive snapshot by root). + * However, we must handle the case of delegated permissions where we + * are allowed to change the limit on the current dataset, but there + * is another limit in the tree above. + */ + enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); + if (enforce == ENFORCE_NEVER) + return (0); + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment + * is 0. + */ + if (delta == 0) + return (0); + + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. + */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } + + /* + * If an ancestor has been provided, stop checking the limit once we + * hit that dir. We need this during rename so that we don't overcount + * the check once we recurse up to the common ancestor. + */ + if (ancestor == dd) + return (0); + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know there is no limit here (or above). The counts are + * not valid on this node and we know we won't touch this node's counts. + */ + if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, + count_prop, sizeof (count), 1, &count) == ENOENT) + return (0); + + err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, + B_FALSE); + if (err != 0) + return (err); + + /* Is there a limit which we've hit? */ + if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) + return (SET_ERROR(EDQUOT)); + + if (dd->dd_parent != NULL) + err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, + ancestor, cr); + + return (err); +} + +/* + * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all + * parents. When a new filesystem/snapshot is created, increment the count on + * all parents, and when a filesystem/snapshot is destroyed, decrement the + * count. + */ +void +dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, + dmu_tx_t *tx) { - return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); + int err; + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t count; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || + strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); + + /* + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We don't count this temporary + * clone, whose name begins with a '%'. We also ignore hidden ($FREE, + * $MOS & $ORIGIN) objsets. + */ + if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && + strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + /* + * e.g. 
if renaming a dataset with no snapshots, count adjustment is 0 + */ + if (delta == 0) + return; + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know the counts are not valid on this node and we + * know we shouldn't touch this node's counts. An uninitialized count + * on the node indicates that either the feature has not yet been + * activated or there are no limits on this part of the tree. + */ + if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, + prop, sizeof (count), 1, &count)) == ENOENT) + return; + VERIFY0(err); + + count += delta; + /* Use a signed verify to make sure we're not neg. */ + VERIFY3S(count, >=, 0); + + VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, + tx)); + + /* Roll up this additional count into our ancestors */ + if (dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); } uint64_t @@ -395,13 +894,13 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_ { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; - dsl_dir_phys_t *dsphys; + dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { - VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, + VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ @@ -410,97 +909,32 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_ } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + ddphys = dbuf->db_data; - dsphys->dd_creation_time = gethrestime_sec(); - if (pds) - dsphys->dd_parent_obj = pds->dd_object; - dsphys->dd_props_zapobj = zap_create(mos, + ddphys->dd_creation_time = gethrestime_sec(); + if (pds) { + ddphys->dd_parent_obj = pds->dd_object; + + /* update the filesystem counts */ + dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); + } + ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); - dsphys->dd_child_dir_zapobj = zap_create(mos, + ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) - dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; + ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } -/* ARGSUSED */ -int -dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t count; - - /* - * There should be exactly two holds, both from - * dsl_dataset_destroy: one on the dd directory, and one on its - * head ds. Otherwise, someone is trying to lookup something - * inside this dir while we want to destroy it. The - * config_rwlock ensures that nobody else opens it after we - * check. 
- */ - if (dmu_buf_refcount(dd->dd_dbuf) > 2) - return (EBUSY); - - err = zap_count(mos, dd->dd_phys->dd_child_dir_zapobj, &count); - if (err) - return (err); - if (count != 0) - return (EEXIST); - - return (0); -} - -void -dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - objset_t *mos = dd->dd_pool->dp_meta_objset; - dsl_prop_setarg_t psa; - uint64_t value = 0; - uint64_t obj; - dd_used_t t; - - ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); - ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); - - /* Remove our reservation. */ - dsl_prop_setarg_init_uint64(&psa, "reservation", - (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), - &value); - psa.psa_effective_value = 0; /* predict default value */ - - dsl_dir_set_reservation_sync(ds, &psa, cr, tx); - - ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); - ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); - for (t = 0; t < DD_USED_NUM; t++) - ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0); - - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); - VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); - VERIFY(0 == dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx)); - VERIFY(0 == zap_remove(mos, - dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx)); - - obj = dd->dd_object; - dsl_dir_close(dd, tag); - VERIFY(0 == dmu_object_free(mos, obj, tx)); -} - boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { - return (dd->dd_phys->dd_origin_obj && + return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || - dd->dd_phys->dd_origin_obj != + dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } @@ -509,39 +943,56 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *n { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, - dd->dd_phys->dd_used_bytes); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); + dsl_dir_phys(dd)->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, + dsl_dir_phys(dd)->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, - dd->dd_phys->dd_reserved); + dsl_dir_phys(dd)->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, - dd->dd_phys->dd_compressed_bytes == 0 ? 100 : - (dd->dd_phys->dd_uncompressed_bytes * 100 / - dd->dd_phys->dd_compressed_bytes)); - if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 
100 : + (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / + dsl_dir_phys(dd)->dd_compressed_bytes)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, + dsl_dir_phys(dd)->dd_uncompressed_bytes); + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, - dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, - dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, - dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, - dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + - dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } mutex_exit(&dd->dd_lock); - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); + if (dsl_dir_is_zapified(dd)) { + uint64_t count; + objset_t *os = dd->dd_pool->dp_meta_objset; + + if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (count), 1, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, + ZFS_PROP_FILESYSTEM_COUNT, count); + } + if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (count), 1, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, + ZFS_PROP_SNAPSHOT_COUNT, count); + } + } + if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; - char buf[MAXNAMELEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } - rw_exit(&dd->dd_pool->dp_config_rwlock); } void @@ -549,9 +1000,9 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *t { dsl_pool_t *dp = dd->dd_pool; - ASSERT(dd->dd_phys); + ASSERT(dsl_dir_phys(dd)); - if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg) == 0) { + if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } @@ -560,8 +1011,9 @@ dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *t static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { - uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); - uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); + uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); + uint64_t new_accounted = + MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } @@ -570,10 +1022,8 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx { ASSERT(dmu_tx_is_syncing(tx)); - dmu_buf_will_dirty(dd->dd_dbuf, tx); - mutex_enter(&dd->dd_lock); - ASSERT3U(dd->dd_tempreserved[tx->tx_txg&TXG_MASK], ==, 0); + ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; @@ -622,9 +1072,9 @@ dsl_dir_space_available(dsl_dir_t *dd, } mutex_enter(&dd->dd_lock); - if (dd->dd_phys->dd_quota != 0) - quota = dd->dd_phys->dd_quota; - used = dd->dd_phys->dd_used_bytes; + if (dsl_dir_phys(dd)->dd_quota != 0) + quota = dsl_dir_phys(dd)->dd_quota; + used = dsl_dir_phys(dd)->dd_used_bytes; if 
(!ondiskonly) used += dsl_dir_space_towrite(dd); @@ -633,12 +1083,12 @@ dsl_dir_space_available(dsl_dir_t *dd, quota = MIN(quota, poolsize); } - if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { + if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ - parentspace += dd->dd_phys->dd_reserved - used; + parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { @@ -652,15 +1102,6 @@ dsl_dir_space_available(dsl_dir_t *dd, if (used > quota) { /* over quota */ myspace = 0; - - /* - * While it's OK to be a little over quota, if - * we think we are using more space than there - * is in the pool (which is already 1.6% more than - * dsl_pool_adjustedsize()), something is very - * wrong. - */ - ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa)); } else { /* * the lesser of the space provided by our parent and @@ -676,7 +1117,6 @@ dsl_dir_space_available(dsl_dir_t *dd, struct tempreserve { list_node_t tr_node; - dsl_pool_t *tr_dp; dsl_dir_t *tr_ds; uint64_t tr_size; }; @@ -707,7 +1147,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; - used_on_disk = dd->dd_phys->dd_used_bytes; + used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and @@ -730,10 +1170,10 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, * If this transaction will result in a net free of space, * we want to let it through. */ - if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) + if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) quota = UINT64_MAX; else - quota = dd->dd_phys->dd_quota; + quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root @@ -770,7 +1210,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, used_on_disk>>10, est_inflight>>10, quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); - return (retval); + return (SET_ERROR(retval)); } /* We need to up our estimated delta before dropping dd_lock */ @@ -787,7 +1227,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { - boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); + boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); @@ -827,29 +1267,29 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); - - err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx); } else { if (err == EAGAIN) { - txg_delay(dd->dd_pool, tx->tx_txg, 1); - err = ERESTART; + /* + * If arc_memory_throttle() detected that pageout + * is running and we are low on memory, we delay new + * non-pageout transactions to give pageout an + * advantage. + * + * It is unfortunate to be delaying while the caller's + * locks are held. 
+ */ + txg_delay(dd->dd_pool, tx->tx_txg, + MSEC2NSEC(10), MSEC2NSEC(10)); + err = SET_ERROR(ERESTART); } - dsl_pool_memory_pressure(dd->dd_pool); } if (err == 0) { - struct tempreserve *tr; - - tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); - tr->tr_dp = dd->dd_pool; - tr->tr_size = asize; - list_insert_tail(tr_list, tr); - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE, asize > usize, tr_list, tx, TRUE); } - if (err) + if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; @@ -873,10 +1313,8 @@ dsl_dir_tempreserve_clear(void *tr_cooki if (tr_cookie == NULL) return; - while (tr = list_head(tr_list)) { - if (tr->tr_dp) { - dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx); - } else if (tr->tr_ds) { + while ((tr = list_head(tr_list)) != NULL) { + if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); @@ -892,8 +1330,14 @@ dsl_dir_tempreserve_clear(void *tr_cooki kmem_free(tr_list, sizeof (list_t)); } -static void -dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +/* + * This should be called from open context when we think we're going to write + * or free space, for example when dirtying data. Be conservative; it's okay + * to write less space or free more, but we don't want to write more or free + * less than the amount specified. + */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; @@ -902,7 +1346,7 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; + est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); @@ -911,19 +1355,7 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd /* XXX this is potentially expensive and unnecessary... */ if (parent_space && dd->dd_parent) - dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx); -} - -/* - * Call in open context when we think we're going to write/free space, - * eg. when dirtying data. Be conservative (ie. OK to write less than - * this or free more than this, but don't write more or free less). - */ -void -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) -{ - dsl_pool_willuse_space(dd->dd_pool, space, tx); - dsl_dir_willuse_space_impl(dd, space, tx); + dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); } /* call from syncing context when we actually write/free space for this dd */ @@ -932,35 +1364,44 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_u int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; + + /* + * dsl_dataset_set_refreservation_sync_impl() calls this with + * dd_lock held, so that it can atomically update + * ds->ds_reserved and the dsl_dir accounting, so that + * dsl_dataset_check_quota() can see dataset and dir accounting + * consistently. 
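As a rough worked example of the accounting below: a directory with dd_used_bytes = 1 GB and dd_reserved = 4 GB that charges another 512 MB stays within its reservation, so parent_delta() returns MAX(1.5 GB, 4 GB) - MAX(1 GB, 4 GB) = 0; the parent's total child charge does not grow, and dsl_dir_transfer_space() merely shifts the 512 MB from the parent's DD_USED_CHILD_RSRV breakdown to DD_USED_CHILD. Only once usage exceeds the reservation does additional space get rolled up to DD_USED_CHILD in the ancestors.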
+ */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); - dsl_dir_dirty(dd, tx); + dmu_buf_will_dirty(dd->dd_dbuf, tx); if (needlock) mutex_enter(&dd->dd_lock); - accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); - ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); + accounted_delta = + parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); + ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); ASSERT(compressed >= 0 || - dd->dd_phys->dd_compressed_bytes >= -compressed); + dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || - dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); - dd->dd_phys->dd_used_bytes += used; - dd->dd_phys->dd_uncompressed_bytes += uncompressed; - dd->dd_phys->dd_compressed_bytes += compressed; + dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); + dsl_dir_phys(dd)->dd_used_bytes += used; + dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; + dsl_dir_phys(dd)->dd_compressed_bytes += compressed; - if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || - dd->dd_phys->dd_used_breakdown[type] >= -used); - dd->dd_phys->dd_used_breakdown[type] += used; + dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); + dsl_dir_phys(dd)->dd_used_breakdown[type] += used; #ifdef DEBUG dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) - u += dd->dd_phys->dd_used_breakdown[t]; - ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); + u += dsl_dir_phys(dd)->dd_used_breakdown[t]; + ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); #endif } if (needlock) @@ -971,7 +1412,7 @@ dsl_dir_diduse_space(dsl_dir_t *dd, dd_u accounted_delta, compressed, uncompressed, tx); dsl_dir_transfer_space(dd->dd_parent, used - accounted_delta, - DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); } } @@ -979,145 +1420,152 @@ void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { - boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); - - ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); - if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) + if (delta == 0 || + !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; - dsl_dir_dirty(dd, tx); - if (needlock) - mutex_enter(&dd->dd_lock); + if (tx != NULL) + dmu_buf_will_dirty(dd->dd_dbuf, tx); + mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? 
- dd->dd_phys->dd_used_breakdown[oldtype] >= delta : - dd->dd_phys->dd_used_breakdown[newtype] >= -delta); - ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); - dd->dd_phys->dd_used_breakdown[oldtype] -= delta; - dd->dd_phys->dd_used_breakdown[newtype] += delta; - if (needlock) - mutex_exit(&dd->dd_lock); + dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : + dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); + ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); + dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; + dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; + mutex_exit(&dd->dd_lock); } +typedef struct dsl_dir_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dir_set_qr_arg_t; + static int -dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - int err; - uint64_t towrite; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t towrite, newval; - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_prop_predict(ds->ds_dir, "quota", + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } - if (psa->psa_effective_value == 0) + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); return (0); + } - mutex_enter(&dd->dd_lock); + mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. 
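A worked example of the quota check that follows: with dd_used_bytes = 10 GB, dd_reserved = 2 GB and towrite = 0, setting quota = 8 GB fails with ENOSPC because the new quota would fall below the space already used, while quota = 12 GB is accepted. In open context a non-zero towrite suppresses the failure, since the pending changes may free enough space to make the new quota valid.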
*/ - towrite = dsl_dir_space_towrite(dd); + towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (psa->psa_effective_value < dd->dd_phys->dd_reserved || - psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { - err = ENOSPC; + (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || + newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { + error = SET_ERROR(ENOSPC); } - mutex_exit(&dd->dd_lock); - return (err); + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); + return (error); } -extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); - -/* ARGSUSED */ static void -dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; - dsl_prop_set_sync(ds, psa, cr, tx); - DSL_PROP_CHECK_PREDICTION(dd, psa); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - dmu_buf_will_dirty(dd->dd_dbuf, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - mutex_enter(&dd->dd_lock); - dd->dd_phys->dd_quota = effective_value; - mutex_exit(&dd->dd_lock); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); + } - spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu ", - (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + dsl_dir_phys(ds->ds_dir)->dd_quota = newval; + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { - dsl_dir_t *dd; - dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; - - dsl_prop_setarg_init_uint64(&psa, "quota", source, "a); + dsl_dir_set_qr_arg_t ddsqra; - err = dsl_dataset_hold(ddname, FTAG, &ds); - if (err) - return (err); + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = quota; - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - ASSERT(ds->ds_dir == dd); - - /* - * If someone removes a file, then tries to set the quota, we want to - * make sure the file freeing takes effect. 
- */ - txg_wait_open(dd->dd_pool, 0); - - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, ds, &psa, 0); - - dsl_dir_close(dd, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); + return (dsl_sync_task(ddname, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } int -dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value; - uint64_t used, avail; - int err; - - if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) - return (err); + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + uint64_t newval, used, avail; + int error; - effective_value = psa->psa_effective_value; + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ - if (!dmu_tx_is_syncing(tx)) + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); return (0); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } mutex_enter(&dd->dd_lock); - used = dd->dd_phys->dd_used_bytes; + used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { @@ -1127,41 +1575,32 @@ dsl_dir_set_reservation_check(void *arg1 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { - uint64_t delta = MAX(used, effective_value) - - MAX(used, dd->dd_phys->dd_reserved); - - if (delta > avail) - return (ENOSPC); - if (dd->dd_phys->dd_quota > 0 && - effective_value > dd->dd_phys->dd_quota) - return (ENOSPC); + if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { + uint64_t delta = MAX(used, newval) - + MAX(used, dsl_dir_phys(dd)->dd_reserved); + + if (delta > avail || + (dsl_dir_phys(dd)->dd_quota > 0 && + newval > dsl_dir_phys(dd)->dd_quota)) + error = SET_ERROR(ENOSPC); } - return (0); + dsl_dataset_rele(ds, FTAG); + return (error); } -/* ARGSUSED */ -static void -dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +void +dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_setarg_t *psa = arg2; - uint64_t effective_value = psa->psa_effective_value; uint64_t used; int64_t delta; - dsl_prop_set_sync(ds, psa, cr, tx); - DSL_PROP_CHECK_PREDICTION(dd, psa); - dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); - used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, effective_value) - - MAX(used, dd->dd_phys->dd_reserved); - dd->dd_phys->dd_reserved = effective_value; + used = dsl_dir_phys(dd)->dd_used_bytes; + delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); + dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ @@ -1169,41 +1608,49 @@ dsl_dir_set_reservation_sync(void *arg1, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); - - spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu", - (longlong_t)effective_value, 
dd->dd_phys->dd_head_dataset_obj); } -int -dsl_dir_set_reservation(const char *ddname, zprop_source_t source, - uint64_t reservation) +static void +dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd; + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; - dsl_prop_setarg_t psa; - int err; + uint64_t newval; - dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - err = dsl_dataset_hold(ddname, FTAG, &ds); - if (err) - return (err); + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - err = dsl_dir_open(ddname, FTAG, &dd, NULL); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_RESERVATION), + (longlong_t)newval); } - ASSERT(ds->ds_dir == dd); + dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation) +{ + dsl_dir_set_qr_arg_t ddsqra; - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, ds, &psa, 0); + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = reservation; - dsl_dir_close(dd, FTAG); - dsl_dataset_rele(ds, FTAG); - return (err); + return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, + dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } static dsl_dir_t * @@ -1230,84 +1677,219 @@ would_change(dsl_dir_t *dd, int64_t delt return (delta); mutex_enter(&dd->dd_lock); - delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta); + delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } -struct renamearg { - dsl_dir_t *newparent; - const char *mynewname; -}; +typedef struct dsl_dir_rename_arg { + const char *ddra_oldname; + const char *ddra_newname; + cred_t *ddra_cred; +} dsl_dir_rename_arg_t; -/*ARGSUSED*/ +/* ARGSUSED */ static int -dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - int err; - uint64_t val; + int *deltap = arg; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; - /* There should be 2 references: the open and the dirty */ - if (dmu_buf_refcount(dd->dd_dbuf) > 2) - return (EBUSY); - - /* check for existing name */ - err = zap_lookup(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - ra->mynewname, 8, 1, &val); - if (err == 0) - return (EEXIST); - if (err != ENOENT) - return (err); + dsl_dataset_name(ds, namebuf); + + if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + return (0); +} + +static int +dsl_dir_rename_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + int error; + int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); + + /* target dir should exist */ 
+ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); + if (error != 0) + return (error); + + /* new parent should exist */ + error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, + &newparent, &mynewname); + if (error != 0) { + dsl_dir_rele(dd, FTAG); + return (error); + } + + /* can't rename to different pool */ + if (dd->dd_pool != newparent->dd_pool) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* new name should not already exist */ + if (mynewname == NULL) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EEXIST)); + } + + /* if the name length is growing, validate child name lengths */ + if (delta > 0) { + error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, + &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + } - if (ra->newparent != dd->dd_parent) { + if (dmu_tx_is_syncing(tx)) { + if (spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Although this is the check function and we don't + * normally make on-disk changes in check functions, + * we need to do that here. + * + * Ensure this portion of the tree's counts have been + * initialized in case the new parent has limits set. + */ + dsl_dir_init_fs_ss_count(dd, tx); + } + } + + if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = - MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); + MAX(dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_reserved); + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + if (dsl_dir_is_zapified(dd)) { + int err; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt); + if (err != ENOENT && err != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (err); + } + + /* + * have to add 1 for the filesystem itself that we're + * moving + */ + fs_cnt++; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt); + if (err != ENOENT && err != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (err); + } + } /* no rename into our descendant */ - if (closest_common_ancestor(dd, ra->newparent) == dd) - return (EINVAL); + if (closest_common_ancestor(dd, newparent) == dd) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EINVAL)); + } - if (err = dsl_dir_transfer_possible(dd->dd_parent, - ra->newparent, myspace)) - return (err); + error = dsl_dir_transfer_possible(dd->dd_parent, + newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } } + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); return (0); } static void -dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - struct renamearg *ra = arg2; - dsl_pool_t *dp = dd->dd_pool; + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + int error; objset_t *mos = dp->dp_meta_objset; - int err; - ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, + &mynewname)); + + /* Log this before we change the name. 
*/ + spa_history_log_internal_dd(dd, "rename", tx, + "-> %s", ddra->ddra_newname); + + if (newparent != dd->dd_parent) { + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + /* + * We already made sure the dd counts were initialized in the + * check function. + */ + if (spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt)); + /* add 1 for the filesystem itself that we're moving */ + fs_cnt++; + + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt)); + } + + dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + + dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); - if (ra->newparent != dd->dd_parent) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, - -dd->dd_phys->dd_used_bytes, - -dd->dd_phys->dd_compressed_bytes, - -dd->dd_phys->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, - dd->dd_phys->dd_used_bytes, - dd->dd_phys->dd_compressed_bytes, - dd->dd_phys->dd_uncompressed_bytes, tx); - - if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { - uint64_t unused_rsrv = dd->dd_phys->dd_reserved - - dd->dd_phys->dd_used_bytes; + -dsl_dir_phys(dd)->dd_used_bytes, + -dsl_dir_phys(dd)->dd_compressed_bytes, + -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(newparent, DD_USED_CHILD, + dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_compressed_bytes, + dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); + + if (dsl_dir_phys(dd)->dd_reserved > + dsl_dir_phys(dd)->dd_used_bytes) { + uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - + dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); - dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, + dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } @@ -1315,68 +1897,71 @@ dsl_dir_rename_sync(void *arg1, void *ar dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ - err = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, + error = zap_remove(mos, + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx); - ASSERT3U(err, ==, 0); + ASSERT0(error); - (void) strcpy(dd->dd_myname, ra->mynewname); - dsl_dir_close(dd->dd_parent, dd); - dd->dd_phys->dd_parent_obj = ra->newparent->dd_object; - VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, - ra->newparent->dd_object, NULL, dd, &dd->dd_parent)); + (void) strcpy(dd->dd_myname, mynewname); + dsl_dir_rele(dd->dd_parent, dd); + dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; + VERIFY0(dsl_dir_hold_obj(dp, + newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ - err = zap_add(mos, ra->newparent->dd_phys->dd_child_dir_zapobj, - dd->dd_myname, 8, 1, &dd->dd_object, tx); - ASSERT3U(err, ==, 0); + VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx)); + +#ifdef __FreeBSD__ +#ifdef _KERNEL + zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); + zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); +#endif +#endif + + dsl_prop_notify_all(dd); - spa_history_internal_log(LOG_DS_RENAME, dd->dd_pool->dp_spa, - tx, cr, 
"dataset = %llu", dd->dd_phys->dd_head_dataset_obj); + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); } int -dsl_dir_rename(dsl_dir_t *dd, const char *newname) +dsl_dir_rename(const char *oldname, const char *newname) { - struct renamearg ra; - int err; + dsl_dir_rename_arg_t ddra; - /* new parent should exist */ - err = dsl_dir_open(newname, FTAG, &ra.newparent, &ra.mynewname); - if (err) - return (err); - - /* can't rename to different pool */ - if (dd->dd_pool != ra.newparent->dd_pool) { - err = ENXIO; - goto out; - } - - /* new name should not already exist */ - if (ra.mynewname == NULL) { - err = EEXIST; - goto out; - } - - err = dsl_sync_task_do(dd->dd_pool, - dsl_dir_rename_check, dsl_dir_rename_sync, dd, &ra, 3); + ddra.ddra_oldname = oldname; + ddra.ddra_newname = newname; + ddra.ddra_cred = CRED(); -out: - dsl_dir_close(ra.newparent, FTAG); - return (err); + return (dsl_sync_task(oldname, + dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, + 3, ZFS_SPACE_CHECK_RESERVED)); } int -dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; + int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) - return (ENOSPC); + return (SET_ERROR(ENOSPC)); + + err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, + ancestor, cr); + if (err != 0) + return (err); + err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, + ancestor, cr); + if (err != 0) + return (err); return (0); } @@ -1403,3 +1988,19 @@ dsl_dir_snap_cmtime_update(dsl_dir_t *dd dd->dd_snap_cmtime = t; mutex_exit(&dd->dd_lock); } + +void +dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); +} + +boolean_t +dsl_dir_is_zapified(dsl_dir_t *dd) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(dd->dd_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c,v retrieving revision 1.3 diff -u -p -r1.3 dsl_pool.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c 27 Feb 2010 23:43:53 -0000 1.3 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c 28 Apr 2017 15:42:59 -0000 @@ -19,14 +19,21 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
*/ #include #include +#include #include #include +#include +#include #include #include #include @@ -36,33 +43,207 @@ #include #include #include +#include +#include +#include +#include +#include + +#if defined(__FreeBSD__) && defined(_KERNEL) +#include +#include +#endif + +/* + * ZFS Write Throttle + * ------------------ + * + * ZFS must limit the rate of incoming writes to the rate at which it is able + * to sync data modifications to the backend storage. Throttling by too much + * creates an artificial limit; throttling by too little can only be sustained + * for short periods and would lead to highly lumpy performance. On a per-pool + * basis, ZFS tracks the amount of modified (dirty) data. As operations change + * data, the amount of dirty data increases; as ZFS syncs out data, the amount + * of dirty data decreases. When the amount of dirty data exceeds a + * predetermined threshold further modifications are blocked until the amount + * of dirty data decreases (as data is synced out). + * + * The limit on dirty data is tunable, and should be adjusted according to + * both the IO capacity and available memory of the system. The larger the + * window, the more ZFS is able to aggregate and amortize metadata (and data) + * changes. However, memory is a limited resource, and allowing for more dirty + * data comes at the cost of keeping other useful data in memory (for example + * ZFS data cached by the ARC). + * + * Implementation + * + * As buffers are modified dsl_pool_willuse_space() increments both the per- + * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of + * dirty space used; dsl_pool_dirty_space() decrements those values as data + * is synced out from dsl_pool_sync(). While only the poolwide value is + * relevant, the per-txg value is useful for debugging. The tunable + * zfs_dirty_data_max determines the dirty space limit. Once that value is + * exceeded, new writes are halted until space frees up. + * + * The zfs_dirty_data_sync tunable dictates the threshold at which we + * ensure that there is a txg syncing (see the comment in txg.c for a full + * description of transaction group stages). + * + * The IO scheduler uses both the dirty space limit and current amount of + * dirty data as inputs. Those values affect the number of concurrent IOs ZFS + * issues. See the comment in vdev_queue.c for details of the IO scheduler. + * + * The delay is also calculated based on the amount of dirty data. See the + * comment above dmu_tx_delay() for details. + */ + +/* + * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, + * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system. + */ +uint64_t zfs_dirty_data_max; +uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024; +int zfs_dirty_data_max_percent = 10; + +/* + * If there is at least this much dirty data, push out a txg. + */ +uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024; + +/* + * Once there is this amount of dirty data, the dmu_tx_delay() will kick in + * and delay each transaction. + * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. + */ +int zfs_delay_min_dirty_percent = 60; + +/* + * This controls how quickly the delay approaches infinity. + * Larger values cause it to delay more for a given amount of dirty data. + * Therefore larger values will cause there to be less dirty data for a + * given throughput. + * + * For the smoothest delay, this value should be about 1 billion divided + * by the maximum number of operations per second. 
This will smoothly + * handle between 10x and 1/10th this number. + * + * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the + * multiply in dmu_tx_delay(). + */ +uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; + + +#if defined(__FreeBSD__) && defined(_KERNEL) -int zfs_no_write_throttle = 0; -int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ -int zfs_txg_synctime_ms = 5000; /* target millisecs to sync a txg */ - -uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ -uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ -uint64_t zfs_write_limit_inflated = 0; -uint64_t zfs_write_limit_override = 0; +extern int zfs_vdev_async_write_active_max_dirty_percent; -kmutex_t zfs_write_limit_lock; +SYSCTL_DECL(_vfs_zfs); -static pgcnt_t old_physmem = 0; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN, + &zfs_dirty_data_max, 0, + "The maximum amount of dirty data in bytes after which new writes are " + "halted until space becomes available"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN, + &zfs_dirty_data_max_max, 0, + "The absolute cap on dirty_data_max when auto calculating"); + +static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), + sysctl_zfs_dirty_data_max_percent, "I", + "The percent of physical memory used to auto calculate dirty_data_max"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN, + &zfs_dirty_data_sync, 0, + "Force a txg if the number of dirty buffer bytes exceed this value"); + +static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS); +/* No zfs_delay_min_dirty_percent tunable due to limit requirements */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int), + sysctl_zfs_delay_min_dirty_percent, "I", + "The limit of outstanding dirty data before transations are delayed"); + +static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS); +/* No zfs_delay_scale tunable due to limit requirements */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_zfs_delay_scale, "QU", + "Controls how quickly the delay approaches infinity"); static int +sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_dirty_data_max_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val > 100) + return (EINVAL); + + zfs_dirty_data_max_percent = val; + + return (0); +} + +static int +sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_delay_min_dirty_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < zfs_vdev_async_write_active_max_dirty_percent) + return (EINVAL); + + zfs_delay_min_dirty_percent = val; + + return (0); +} + +static int +sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_delay_scale; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val > UINT64_MAX / zfs_dirty_data_max) + return (EINVAL); + + zfs_delay_scale = val; + + return (0); +} +#endif + +hrtime_t zfs_throttle_delay = MSEC2NSEC(10); +hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); + +int dsl_pool_open_special_dir(dsl_pool_t *dp, 
const char *name, dsl_dir_t **ddp) { uint64_t obj; int err; err = zap_lookup(dp->dp_meta_objset, - dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, + dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, name, sizeof (obj), 1, &obj); if (err) return (err); - return (dsl_dir_open_obj(dp, obj, name, dp, ddp)); + return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * @@ -74,21 +255,20 @@ dsl_pool_open_impl(spa_t *spa, uint64_t dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; - rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); - dp->dp_write_limit = zfs_write_limit_min; + rrw_init(&dp->dp_config_rwlock, B_TRUE); txg_init(dp, txg); txg_list_create(&dp->dp_dirty_datasets, offsetof(dsl_dataset_t, ds_dirty_link)); + txg_list_create(&dp->dp_dirty_zilogs, + offsetof(zilog_t, zl_dirty_link)); txg_list_create(&dp->dp_dirty_dirs, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, - offsetof(dsl_sync_task_group_t, dstg_node)); - list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), - offsetof(dsl_dataset_t, ds_synced_link)); + offsetof(dsl_sync_task_t, dst_node)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 1, 4, 0); @@ -97,26 +277,37 @@ dsl_pool_open_impl(spa_t *spa, uint64_t } int -dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - dsl_dir_t *dd; - dsl_dataset_t *ds; - rw_enter(&dp->dp_config_rwlock, RW_WRITER); err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &dp->dp_meta_objset); - if (err) - goto out; + if (err != 0) + dsl_pool_close(dp); + else + *dpp = dp; + return (err); +} + +int +dsl_pool_open(dsl_pool_t *dp) +{ + int err; + dsl_dir_t *dd; + dsl_dataset_t *ds; + uint64_t obj; + + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); if (err) goto out; - err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir); if (err) goto out; @@ -125,131 +316,122 @@ dsl_pool_open(spa_t *spa, uint64_t txg, if (err) goto out; - if (spa_version(spa) >= SPA_VERSION_ORIGIN) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; - err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, - FTAG, &ds); + err = dsl_dataset_hold_obj(dp, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); if (err == 0) { err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } - dsl_dir_close(dd, dp); + dsl_dir_rele(dd, dp); if (err) goto out; } - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, - &dp->dp_tmp_userrefs_obj); - if (err == ENOENT) - err = 0; - if (err) - goto out; - - /* get scrub status */ - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - &dp->dp_scrub_func); - if (err == 0) { - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof 
(uint64_t), 1, - &dp->dp_scrub_queue_obj); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg); - if (err) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg); + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, + &dp->dp_free_dir); if (err) goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark); + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; + VERIFY0(bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + + /* + * Note: errors ignored, because the leak dir will not exist if we + * have not encountered a leak yet. + */ + (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, + &dp->dp_leak_dir); + + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark); - if (err && err != ENOENT) - goto out; - err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max); - if (err && err != ENOENT) + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj); + if (err != 0) goto out; + } + + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors); - if (err) + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj); + if (err != 0) goto out; - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - /* - * A new-type scrub was in progress on an old - * pool. Restart from the beginning, since the - * old software may have changed the pool in the - * meantime. - */ - dsl_pool_scrub_restart(dp); - } - } else { - /* - * It's OK if there is no scrub in progress (and if - * there was an I/O error, ignore it). - */ - err = 0; } -out: - rw_exit(&dp->dp_config_rwlock); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, + &dp->dp_tmp_userrefs_obj); + if (err == ENOENT) + err = 0; if (err) - dsl_pool_close(dp); - else - *dpp = dp; + goto out; + + err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); +out: + rrw_exit(&dp->dp_config_rwlock, FTAG); return (err); } void dsl_pool_close(dsl_pool_t *dp) { - /* drop our references from dsl_pool_open() */ - /* + * Drop our references from dsl_pool_open(). + * * Since we held the origin_snap from "syncing" context (which * includes pool-opening context), it actually only got a "ref" * and not a hold, so just drop that here. 
*/ if (dp->dp_origin_snap) - dsl_dataset_drop_ref(dp->dp_origin_snap, dp); + dsl_dataset_rele(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) - dsl_dir_close(dp->dp_mos_dir, dp); + dsl_dir_rele(dp->dp_mos_dir, dp); + if (dp->dp_free_dir) + dsl_dir_rele(dp->dp_free_dir, dp); + if (dp->dp_leak_dir) + dsl_dir_rele(dp->dp_leak_dir, dp); if (dp->dp_root_dir) - dsl_dir_close(dp->dp_root_dir, dp); + dsl_dir_rele(dp->dp_root_dir, dp); + + bpobj_close(&dp->dp_free_bpobj); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); - txg_list_destroy(&dp->dp_dirty_dirs); + txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); - list_destroy(&dp->dp_synced_datasets); + txg_list_destroy(&dp->dp_dirty_dirs); + + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); - arc_flush(dp->dp_spa); txg_fini(dp); - rw_destroy(&dp->dp_config_rwlock); + dsl_scan_fini(dp); + dmu_buf_user_evict_wait(); + + rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); - mutex_destroy(&dp->dp_scrub_cancel_lock); + cv_destroy(&dp->dp_spaceavail_cv); taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); @@ -264,7 +446,9 @@ dsl_pool_create(spa_t *spa, nvlist_t *zp dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); objset_t *os; dsl_dataset_t *ds; - uint64_t dsobj; + uint64_t obj; + + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, @@ -273,28 +457,48 @@ dsl_pool_create(spa_t *spa, nvlist_t *zp /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); - ASSERT3U(err, ==, 0); + ASSERT0(err); + + /* Initialize scan structures */ + VERIFY0(dsl_scan_init(dp, txg)); /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); - VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); - VERIFY(0 == dsl_pool_open_special_dir(dp, + VERIFY0(dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + /* create and open the free dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + FREE_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* create and open the free_bplist */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); /* create the root dataset */ - dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); /* create the root objset */ - VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + 
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); #ifdef _KERNEL zfs_create_fs(os, kcred, zplprops, tx); #endif @@ -302,9 +506,54 @@ dsl_pool_create(spa_t *spa, nvlist_t *zp dmu_tx_commit(tx); + rrw_exit(&dp->dp_config_rwlock, FTAG); + return (dp); } +/* + * Account for the meta-objset space in its placeholder dsl_dir. + */ +void +dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp) +{ + ASSERT3U(comp, ==, uncomp); /* it's all metadata */ + mutex_enter(&dp->dp_lock); + dp->dp_mos_used_delta += used; + dp->dp_mos_compressed_delta += comp; + dp->dp_mos_uncompressed_delta += uncomp; + mutex_exit(&dp->dp_lock); +} + +static void +dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) +{ + zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dmu_objset_sync(dp->dp_meta_objset, zio, tx); + VERIFY0(zio_wait(zio)); + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); +} + +static void +dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + if (delta < 0) + ASSERT3U(-delta, <=, dp->dp_dirty_total); + + dp->dp_dirty_total += delta; + + /* + * Note: we signal even when increasing dp_dirty_total. + * This ensures forward progress -- each thread wakes the next waiter. + */ + if (dp->dp_dirty_total <= zfs_dirty_data_max) + cv_signal(&dp->dp_spaceavail_cv); +} + void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { @@ -312,157 +561,150 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t t dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; - dsl_sync_task_group_t *dstg; objset_t *mos = dp->dp_meta_objset; - hrtime_t start, write_time; - uint64_t data_written; - int err; + list_t synced_datasets; - tx = dmu_tx_create_assigned(dp, txg); + list_create(&synced_datasets, sizeof (dsl_dataset_t), + offsetof(dsl_dataset_t, ds_synced_link)); - dp->dp_read_overhead = 0; - start = gethrtime(); + tx = dmu_tx_create_assigned(dp, txg); + /* + * Write out all dirty blocks of dirty datasets. + */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because * we may have taken a snapshot of them. However, we * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); - list_insert_tail(&dp->dp_synced_datasets, ds); + list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } - DTRACE_PROBE(pool_sync__1setup); - err = zio_wait(zio); + VERIFY0(zio_wait(zio)); - write_time = gethrtime() - start; - ASSERT(err == 0); - DTRACE_PROBE(pool_sync__2rootzio); - - for (ds = list_head(&dp->dp_synced_datasets); ds; - ds = list_next(&dp->dp_synced_datasets, ds)) - dmu_objset_do_userquota_callbacks(ds->ds_objset, tx); + /* + * We have written all of the accounted dirty data, so our + * dp_space_towrite should now be zero. However, some seldom-used + * code paths do not adhere to this (e.g. dbuf_undirty(), also + * rounding error in dbuf_write_physdone). + * Shore up the accounting of any dirtied space now. 
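As an aside for readers of this hunk (the sketch below is not part of the patch): the per-txg dirty accounting that dsl_pool_dirty_space() and dsl_pool_undirty_space() implement can be modelled in a few lines of user-space C. The toy_* names, the 4-slot ring and the 64 MB / 60% tunables are assumptions chosen only for illustration; the real code uses TXG_SIZE/TXG_MASK, the pool's dp_lock and the zfs_dirty_data_* globals introduced earlier in this file.

	/*
	 * Editorial toy model, not part of the patch: per-txg dirty-data
	 * accounting plus the "should we delay writers?" decision.
	 */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define TOY_TXG_SIZE	4
	#define TOY_TXG_MASK	(TOY_TXG_SIZE - 1)

	struct toy_pool {
		uint64_t dirty_pertxg[TOY_TXG_SIZE];	/* per-txg, for debugging */
		uint64_t dirty_total;			/* pool-wide, drives throttling */
	};

	static uint64_t toy_dirty_data_max = 64ULL << 20;	/* assumed cap */
	static int toy_delay_min_dirty_percent = 60;		/* assumed threshold */

	static void
	toy_dirty(struct toy_pool *p, uint64_t txg, uint64_t space)
	{
		p->dirty_pertxg[txg & TOY_TXG_MASK] += space;
		p->dirty_total += space;
	}

	static void
	toy_undirty(struct toy_pool *p, uint64_t txg, uint64_t space)
	{
		/* never remove more than was accounted against this txg */
		if (space > p->dirty_pertxg[txg & TOY_TXG_MASK])
			space = p->dirty_pertxg[txg & TOY_TXG_MASK];
		p->dirty_pertxg[txg & TOY_TXG_MASK] -= space;
		assert(p->dirty_total >= space);
		p->dirty_total -= space;
	}

	static int
	toy_need_delay(const struct toy_pool *p)
	{
		uint64_t delay_min_bytes =
		    toy_dirty_data_max * toy_delay_min_dirty_percent / 100;

		return (p->dirty_total > delay_min_bytes);
	}

	int
	main(void)
	{
		struct toy_pool p = { { 0 }, 0 };

		toy_dirty(&p, 7, 48ULL << 20);	/* writes accumulate in txg 7 */
		printf("delay? %d\n", toy_need_delay(&p));	/* 48M > 38.4M -> 1 */
		toy_undirty(&p, 7, 48ULL << 20);	/* sync pushes the data out */
		printf("delay? %d\n", toy_need_delay(&p));	/* -> 0 */
		return (0);
	}

Compiled with any C99 compiler, the first check reports that writers should be delayed (48 MB of dirty data exceeds 60% of the 64 MB cap) and the second does not once the data has been "synced" out.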
+ */ + dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); + + /* + * Update the long range free counter after + * we're done syncing user data + */ + mutex_enter(&dp->dp_lock); + ASSERT(spa_sync_pass(dp->dp_spa) == 1 || + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; + mutex_exit(&dp->dp_lock); + + /* + * After the data blocks have been written (ensured by the zio_wait() + * above), update the user/group space accounting. + */ + for (ds = list_head(&synced_datasets); ds != NULL; + ds = list_next(&synced_datasets, ds)) { + dmu_objset_do_userquota_updates(ds->ds_objset, tx); + } /* * Sync the datasets again to push out the changes due to - * userquota updates. This must be done before we process the - * sync tasks, because that could cause a snapshot of a dataset - * whose ds_bp will be rewritten when we do this 2nd sync. + * userspace updates. This must be done before we process the + * sync tasks, so that any snapshots will have the correct + * user accounting information (and we won't get confused + * about which blocks are part of the snapshot). */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); } - err = zio_wait(zio); + VERIFY0(zio_wait(zio)); /* - * If anything was added to a deadlist during a zio done callback, - * it had to be put on the deferred queue. Enqueue it for real now. + * Now that the datasets have been completely synced, we can + * clean up our in-memory structures accumulated while syncing: + * + * - move dead blocks from the pending deadlist to the on-disk deadlist + * - release hold from dsl_dataset_dirty() */ - for (ds = list_head(&dp->dp_synced_datasets); ds; - ds = list_next(&dp->dp_synced_datasets, ds)) - bplist_sync(&ds->ds_deadlist, - bplist_enqueue_cb, &ds->ds_deadlist, tx); - - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { - /* - * No more sync tasks should have been added while we - * were syncing. - */ - ASSERT(spa_sync_pass(dp->dp_spa) == 1); - dsl_sync_task_group_sync(dstg, tx); + while ((ds = list_remove_head(&synced_datasets)) != NULL) { + dsl_dataset_sync_done(ds, tx); } - DTRACE_PROBE(pool_sync__3task); - - start = gethrtime(); - while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) + while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { dsl_dir_sync(dd, tx); - write_time += gethrtime() - start; + } - if (spa_sync_pass(dp->dp_spa) == 1) { - dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - dsl_pool_scrub_sync(dp, tx); - (void) zio_wait(dp->dp_scrub_prefetch_zio_root); + /* + * The MOS's space is accounted for in the pool/$MOS + * (dp_mos_dir). We can't modify the mos while we're syncing + * it, so we remember the deltas and apply them here. 
+ */ + if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || + dp->dp_mos_uncompressed_delta != 0) { + dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, + dp->dp_mos_used_delta, + dp->dp_mos_compressed_delta, + dp->dp_mos_uncompressed_delta, tx); + dp->dp_mos_used_delta = 0; + dp->dp_mos_compressed_delta = 0; + dp->dp_mos_uncompressed_delta = 0; } - start = gethrtime(); if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(mos, zio, tx); - err = zio_wait(zio); - ASSERT(err == 0); - dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); - spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); - } - write_time += gethrtime() - start; - DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time, - hrtime_t, dp->dp_read_overhead); - write_time -= dp->dp_read_overhead; - - dmu_tx_commit(tx); - - data_written = dp->dp_space_towrite[txg & TXG_MASK]; - dp->dp_space_towrite[txg & TXG_MASK] = 0; - ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0); - - /* - * If the write limit max has not been explicitly set, set it - * to a fraction of available physical memory (default 1/8th). - * Note that we must inflate the limit because the spa - * inflates write sizes to account for data replication. - * Check this each sync phase to catch changing memory size. - */ - if (physmem != old_physmem && zfs_write_limit_shift) { - mutex_enter(&zfs_write_limit_lock); - old_physmem = physmem; - zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift; - zfs_write_limit_inflated = MAX(zfs_write_limit_min, - spa_get_asize(dp->dp_spa, zfs_write_limit_max)); - mutex_exit(&zfs_write_limit_lock); + dsl_pool_sync_mos(dp, tx); } /* - * Attempt to keep the sync time consistent by adjusting the - * amount of write traffic allowed into each transaction group. - * Weight the throughput calculation towards the current value: - * thru = 3/4 old_thru + 1/4 new_thru - * - * Note: write_time is in nanosecs, so write_time/MICROSEC - * yields millisecs + * If we modify a dataset in the same txg that we want to destroy it, + * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. + * dsl_dir_destroy_check() will fail if there are unexpected holds. + * Therefore, we want to sync the MOS (thus syncing the dd_dbuf + * and clearing the hold on it) before we process the sync_tasks. + * The MOS data dirtied by the sync_tasks will be synced on the next + * pass. */ - ASSERT(zfs_write_limit_min > 0); - if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { - uint64_t throughput = data_written / (write_time / MICROSEC); - - if (dp->dp_throughput) - dp->dp_throughput = throughput / 4 + - 3 * dp->dp_throughput / 4; - else - dp->dp_throughput = throughput; - dp->dp_write_limit = MIN(zfs_write_limit_inflated, - MAX(zfs_write_limit_min, - dp->dp_throughput * zfs_txg_synctime_ms)); + if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { + dsl_sync_task_t *dst; + /* + * No more sync tasks should have been added while we + * were syncing. 
+ */ + ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); + while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) + dsl_sync_task_sync(dst, tx); } + + dmu_tx_commit(tx); + + DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); } void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { - dsl_dataset_t *ds; - objset_t *os; + zilog_t *zilog; - while (ds = list_head(&dp->dp_synced_datasets)) { - list_remove(&dp->dp_synced_datasets, ds); - os = ds->ds_objset; - zil_clean(os->os_zil); - ASSERT(!dmu_objset_is_dirty(os, txg)); - dmu_buf_rele(ds->ds_dbuf, ds); + while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) { + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + /* + * We don't remove the zilog from the dp_dirty_zilogs + * list until after we've cleaned it. This ensures that + * callers of zilog_is_dirty() receive an accurate + * answer when they are racing with the spa sync thread. + */ + zil_clean(zilog, txg); + (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); + ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); + dmu_buf_rele(ds->ds_dbuf, zilog); } ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } @@ -475,7 +717,7 @@ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || - spa_get_dsl(dp->dp_spa) == NULL); + spa_is_initializing(dp->dp_spa)); } uint64_t @@ -484,121 +726,83 @@ dsl_pool_adjustedsize(dsl_pool_t *dp, bo uint64_t space, resv; /* - * Reserve about 1.6% (1/64), or at least 32MB, for allocation - * efficiency. - * XXX The intent log is not accounted for, so it must fit - * within this slop. - * * If we're trying to assess whether it's OK to do a free, * cut the reservation in half to allow forward progress * (e.g. make it possible to rm(1) files from a full pool). */ space = spa_get_dspace(dp->dp_spa); - resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); + resv = spa_get_slop_space(dp->dp_spa); if (netfree) resv >>= 1; return (space - resv); } -int -dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) +boolean_t +dsl_pool_need_dirty_delay(dsl_pool_t *dp) { - uint64_t reserved = 0; - uint64_t write_limit = (zfs_write_limit_override ? - zfs_write_limit_override : dp->dp_write_limit); - - if (zfs_no_write_throttle) { - atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], - space); - return (0); - } - - /* - * Check to see if we have exceeded the maximum allowed IO for - * this transaction group. We can do this without locks since - * a little slop here is ok. Note that we do the reserved check - * with only half the requested reserve: this is because the - * reserve requests are worst-case, and we really don't want to - * throttle based off of worst-case estimates. - */ - if (write_limit > 0) { - reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] - + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; - - if (reserved && reserved > write_limit) - return (ERESTART); - } - - atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); - - /* - * If this transaction group is over 7/8ths capacity, delay - * the caller 1 clock tick. This will slow down the "fill" - * rate until the sync process can catch up with us. 
- */ - if (reserved && reserved > (write_limit - (write_limit >> 3))) - txg_delay(dp, tx->tx_txg, 1); - - return (0); + uint64_t delay_min_bytes = + zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; + boolean_t rv; + + mutex_enter(&dp->dp_lock); + if (dp->dp_dirty_total > zfs_dirty_data_sync) + txg_kick(dp); + rv = (dp->dp_dirty_total > delay_min_bytes); + mutex_exit(&dp->dp_lock); + return (rv); } void -dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { - ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); - atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); -} - -void -dsl_pool_memory_pressure(dsl_pool_t *dp) -{ - uint64_t space_inuse = 0; - int i; - - if (dp->dp_write_limit == zfs_write_limit_min) - return; - - for (i = 0; i < TXG_SIZE; i++) { - space_inuse += dp->dp_space_towrite[i]; - space_inuse += dp->dp_tempreserved[i]; + if (space > 0) { + mutex_enter(&dp->dp_lock); + dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; + dsl_pool_dirty_delta(dp, space); + mutex_exit(&dp->dp_lock); } - dp->dp_write_limit = MAX(zfs_write_limit_min, - MIN(dp->dp_write_limit, space_inuse / 4)); } void -dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) { - if (space > 0) { - mutex_enter(&dp->dp_lock); - dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; - mutex_exit(&dp->dp_lock); + ASSERT3S(space, >=, 0); + if (space == 0) + return; + mutex_enter(&dp->dp_lock); + if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { + /* XXX writing something we didn't dirty? */ + space = dp->dp_dirty_pertxg[txg & TXG_MASK]; } + ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); + dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; + ASSERT3U(dp->dp_dirty_total, >=, space); + dsl_pool_dirty_delta(dp, -space); + mutex_exit(&dp->dp_lock); } /* ARGSUSED */ static int -upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) +upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds, *prev = NULL; int err; - dsl_pool_t *dp = spa_get_dsl(spa); - err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); - while (ds->ds_phys->ds_prev_snap_obj != 0) { - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - FTAG, &prev); + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) break; dsl_dataset_rele(ds, FTAG); ds = prev; @@ -612,7 +816,9 @@ upgrade_clones_cb(spa_t *spa, uint64_t d * The $ORIGIN can't have any data, or the accounting * will be wrong. 
*/ - ASSERT(prev->ds_phys->ds_bp.blk_birth == 0); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); + rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { @@ -621,33 +827,35 @@ upgrade_clones_cb(spa_t *spa, uint64_t d } dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_prev_snap_obj = prev->ds_object; - ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg; + dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; + dsl_dataset_phys(ds)->ds_prev_snap_txg = + dsl_dataset_phys(prev)->ds_creation_txg; dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); - ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object; + dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; dmu_buf_will_dirty(prev->ds_dbuf, tx); - prev->ds_phys->ds_num_children++; + dsl_dataset_phys(prev)->ds_num_children++; - if (ds->ds_phys->ds_next_snap_obj == 0) { + if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); - VERIFY(0 == dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds, &ds->ds_prev)); } } - ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object); - ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); + ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); - if (prev->ds_phys->ds_next_clones_obj == 0) { + if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); - prev->ds_phys->ds_next_clones_obj = + dsl_dataset_phys(prev)->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); if (prev != dp->dp_origin_snap) @@ -661,8 +869,62 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); - VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, - tx, DS_FIND_CHILDREN)); + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, + tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); +} + +/* ARGSUSED */ +static int +upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dmu_tx_t *tx = arg; + objset_t *mos = dp->dp_meta_objset; + + if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { + dsl_dataset_t *origin; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); + + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + dsl_dir_phys(origin->ds_dir)->dd_clones = + zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, + 0, tx); + } + + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(origin->ds_dir)->dd_clones, + ds->ds_object, tx)); + + dsl_dataset_rele(origin, FTAG); + } + return (0); +} + +void +dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t obj; + + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* + * We can't use bpobj_alloc(), because spa_version() still + * returns the old version, and we need a new-version 
bpobj with + * subobj support. So call dmu_object_alloc() directly. + */ + obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); + + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); } void @@ -673,17 +935,16 @@ dsl_pool_create_origin(dsl_pool_t *dp, d ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap == NULL); + ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ - rw_enter(&dp->dp_config_rwlock, RW_WRITER); dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, tx); - VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx); - VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); + VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); - rw_exit(&dp->dp_config_rwlock); } taskq_t * @@ -703,23 +964,34 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t * zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; + nvlist_t *holds; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + holds = fnvlist_alloc(); + for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { char *htag; - uint64_t dsobj; + nvlist_t *tags; htag = strchr(za.za_name, '-'); *htag = '\0'; ++htag; - dsobj = strtonum(za.za_name, NULL); - (void) dsl_dataset_user_release_tmp(dp, dsobj, htag); + if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(holds, za.za_name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } } + dsl_dataset_user_release_tmp(dp, holds); + fnvlist_free(holds); zap_cursor_fini(&zc); } @@ -734,16 +1006,13 @@ dsl_pool_user_hold_create_obj(dsl_pool_t ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); - dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, - DMU_OT_NONE, 0, tx); - - VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, - sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); + dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, - const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) + const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; @@ -762,13 +1031,13 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t dsl_pool_user_hold_create_obj(dp, tx); zapobj = dp->dp_tmp_userrefs_obj; } else { - return (ENOENT); + return (SET_ERROR(ENOENT)); } } name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) - error = zap_add(mos, zapobj, name, 8, 1, now, tx); + error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); strfree(name); @@ -781,7 +1050,7 @@ dsl_pool_user_hold_rele_impl(dsl_pool_t */ 
int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, - uint64_t *now, dmu_tx_t *tx) + uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } @@ -793,6 +1062,122 @@ int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx) { - return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); } + +/* + * DSL Pool Configuration Lock + * + * The dp_config_rwlock protects against changes to DSL state (e.g. dataset + * creation / destruction / rename / property setting). It must be held for + * read to hold a dataset or dsl_dir. I.e. you must call + * dsl_pool_config_enter() or dsl_pool_hold() before calling + * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock + * must be held continuously until all datasets and dsl_dirs are released. + * + * The only exception to this rule is that if a "long hold" is placed on + * a dataset, then the dp_config_rwlock may be dropped while the dataset + * is still held. The long hold will prevent the dataset from being + * destroyed -- the destroy will fail with EBUSY. A long hold can be + * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset + * (by calling dsl_{dataset,objset}_{try}own{_obj}). + * + * Legitimate long-holders (including owners) should be long-running, cancelable + * tasks that should cause "zfs destroy" to fail. This includes DMU + * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), + * "zfs send", and "zfs diff". There are several other long-holders whose + * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). + * + * The usual formula for long-holding would be: + * dsl_pool_hold() + * dsl_dataset_hold() + * ... perform checks ... + * dsl_dataset_long_hold() + * dsl_pool_rele() + * ... perform long-running task ... + * dsl_dataset_long_rele() + * dsl_dataset_rele() + * + * Note that when the long hold is released, the dataset is still held but + * the pool is not held. The dataset may change arbitrarily during this time + * (e.g. it could be destroyed). Therefore you shouldn't do anything to the + * dataset except release it. + * + * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only + * or modifying operations. + * + * Modifying operations should generally use dsl_sync_task(). The synctask + * infrastructure enforces proper locking strategy with respect to the + * dp_config_rwlock. See the comment above dsl_sync_task() for details. + * + * Read-only operations will manually hold the pool, then the dataset, obtain + * information from the dataset, then release the pool and dataset. + * dmu_objset_{hold,rele}() are convenience routines that also do the pool + * hold/rele. + */ + +int +dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) +{ + spa_t *spa; + int error; + + error = spa_open(name, &spa, tag); + if (error == 0) { + *dp = spa_get_dsl(spa); + dsl_pool_config_enter(*dp, tag); + } + return (error); +} + +void +dsl_pool_rele(dsl_pool_t *dp, void *tag) +{ + dsl_pool_config_exit(dp, tag); + spa_close(dp->dp_spa, tag); +} + +void +dsl_pool_config_enter(dsl_pool_t *dp, void *tag) +{ + /* + * We use a "reentrant" reader-writer lock, but not reentrantly. + * + * The rrwlock can (with the track_all flag) track all reading threads, + * which is very useful for debugging which code path failed to release + * the lock, and for verifying that the *current* thread does hold + * the lock. 
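To make the read-only access pattern described in the configuration-lock comment above concrete, here is a hedged kernel-context sketch (not part of the patch, and not buildable outside the ZFS tree). example_get_ds_prop and the use of "recordsize" are invented for illustration; dsl_pool_hold/dsl_pool_rele, dsl_dataset_hold/dsl_dataset_rele and dsl_prop_get_int_ds are the interfaces this change provides.

	/*
	 * Editorial sketch, not part of the patch: a read-only operation
	 * takes the pool (and with it dp_config_rwlock for read), then the
	 * dataset, reads what it needs, and releases in reverse order.
	 */
	static int
	example_get_ds_prop(const char *dsname, uint64_t *valp)
	{
		dsl_pool_t *dp;
		dsl_dataset_t *ds;
		int error;

		/* takes dp_config_rwlock for read via dsl_pool_config_enter() */
		error = dsl_pool_hold(dsname, FTAG, &dp);
		if (error != 0)
			return (error);

		error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
		if (error != 0) {
			dsl_pool_rele(dp, FTAG);
			return (error);
		}

		/* read while both the pool and the dataset are held */
		error = dsl_prop_get_int_ds(ds, "recordsize", valp);

		dsl_dataset_rele(ds, FTAG);
		dsl_pool_rele(dp, FTAG);	/* drops dp_config_rwlock */
		return (error);
	}

The ordering matters: the pool is held before the dataset and released after it, so the dataset is never referenced without the configuration lock (unless a long hold has been taken, as the comment above explains).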
+ * + * (Unlike a rwlock, which knows that N threads hold it for + * read, but not *which* threads, so rw_held(RW_READER) returns TRUE + * if any thread holds it for read, even if this thread doesn't). + */ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); +} + +void +dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) +{ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter_read_prio(&dp->dp_config_rwlock, tag); +} + +void +dsl_pool_config_exit(dsl_pool_t *dp, void *tag) +{ + rrw_exit(&dp->dp_config_rwlock, tag); +} + +boolean_t +dsl_pool_config_held(dsl_pool_t *dp) +{ + return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); +} + +boolean_t +dsl_pool_config_held_writer(dsl_pool_t *dp) +{ + return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dsl_prop.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c 27 Feb 2010 22:30:58 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c 10 Oct 2016 11:09:56 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include @@ -41,27 +42,27 @@ #define ZPROP_RECVD_SUFFIX "$recvd" static int -dodefault(const char *propname, int intsz, int numints, void *buf) +dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) { - zfs_prop_t prop; - /* * The setonce properties are read-only, BUT they still * have a default value that can be used as the initial * value. */ - if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL || + if (prop == ZPROP_INVAL || (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) - return (ENOENT); + return (SET_ERROR(ENOENT)); if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { + if (zfs_prop_default_string(prop) == NULL) + return (SET_ERROR(ENOENT)); if (intsz != 1) - return (EOVERFLOW); + return (SET_ERROR(EOVERFLOW)); (void) strncpy(buf, zfs_prop_default_string(prop), numints); } else { if (intsz != 8 || numints < 1) - return (EOVERFLOW); + return (SET_ERROR(EOVERFLOW)); *(uint64_t *)buf = zfs_prop_default_numeric(prop); } @@ -82,7 +83,7 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha char *inheritstr; char *recvdstr; - ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); + ASSERT(dsl_pool_config_held(dd->dd_pool)); if (setpoint) setpoint[0] = '\0'; @@ -97,8 +98,6 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { - ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); - if (dd != target || snapshot) { if (!inheritable) break; @@ -106,8 +105,8 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha } /* Check for a local value. */ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - intsz, numints, buf); + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) dsl_dir_name(dd, setpoint); @@ -118,14 +117,14 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha * Skip the check for a received value if there is an explicit * inheritance entry. 
*/ - err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, inheritstr); if (err != 0 && err != ENOENT) break; if (err == ENOENT) { /* Check for a received value. */ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, recvdstr, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) { @@ -146,11 +145,11 @@ dsl_prop_get_dd(dsl_dir_t *dd, const cha * at the end of the loop (instead of at the beginning) ensures * that err has a valid post-loop value. */ - err = ENOENT; + err = SET_ERROR(ENOENT); } if (err == ENOENT) - err = dodefault(propname, intsz, numints, buf); + err = dodefault(prop, intsz, numints, buf); strfree(inheritstr); strfree(recvdstr); @@ -164,19 +163,17 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const { zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t inheritable; - boolean_t snapshot; uint64_t zapobj; - ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); - zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj); + zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; - ASSERT(snapshot); + ASSERT(ds->ds_is_snapshot); /* Check for a local value. */ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); @@ -216,7 +213,59 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numints, buf, setpoint, snapshot)); + intsz, numints, buf, setpoint, ds->ds_is_snapshot)); +} + +static dsl_prop_record_t * +dsl_prop_record_find(dsl_dir_t *dd, const char *propname) +{ + dsl_prop_record_t *pr = NULL; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + for (pr = list_head(&dd->dd_props); + pr != NULL; pr = list_next(&dd->dd_props, pr)) { + if (strcmp(pr->pr_propname, propname) == 0) + break; + } + + return (pr); +} + +static dsl_prop_record_t * +dsl_prop_record_create(dsl_dir_t *dd, const char *propname) +{ + dsl_prop_record_t *pr; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP); + pr->pr_propname = spa_strdup(propname); + list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_pr_node)); + list_insert_head(&dd->dd_props, pr); + + return (pr); +} + +void +dsl_prop_init(dsl_dir_t *dd) +{ + list_create(&dd->dd_props, sizeof (dsl_prop_record_t), + offsetof(dsl_prop_record_t, pr_node)); +} + +void +dsl_prop_fini(dsl_dir_t *dd) +{ + dsl_prop_record_t *pr; + + while ((pr = list_remove_head(&dd->dd_props)) != NULL) { + list_destroy(&pr->pr_cbs); + strfree((char *)pr->pr_propname); + kmem_free(pr, sizeof (dsl_prop_record_t)); + } + list_destroy(&dd->dd_props); } /* @@ -233,38 +282,31 @@ dsl_prop_register(dsl_dataset_t *ds, con dsl_dir_t *dd = ds->ds_dir; dsl_pool_t *dp = dd->dd_pool; uint64_t value; + dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; int err; - int need_rwlock; - need_rwlock = !RW_WRITE_HELD(&dp->dp_config_rwlock); - if (need_rwlock) - rw_enter(&dp->dp_config_rwlock, RW_READER); - - err = dsl_prop_get_ds(ds, propname, 8, 1, &value, NULL); - if (err != 0) { - if (need_rwlock) - rw_exit(&dp->dp_config_rwlock); + ASSERT(dsl_pool_config_held(dp)); + + err = dsl_prop_get_int_ds(ds, propname, &value); + if (err != 0) return (err); - } cbr = 
kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); cbr->cbr_ds = ds; - cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP); - (void) strcpy((char *)cbr->cbr_propname, propname); cbr->cbr_func = callback; cbr->cbr_arg = cbarg; + mutex_enter(&dd->dd_lock); - list_insert_head(&dd->dd_prop_cbs, cbr); + pr = dsl_prop_record_find(dd, propname); + if (pr == NULL) + pr = dsl_prop_record_create(dd, propname); + cbr->cbr_pr = pr; + list_insert_head(&pr->pr_cbs, cbr); + list_insert_head(&ds->ds_prop_cbs, cbr); mutex_exit(&dd->dd_lock); cbr->cbr_func(cbr->cbr_arg, value); - - VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, - NULL, cbr, &dd)); - if (need_rwlock) - rw_exit(&dp->dp_config_rwlock); - /* Leave dir open until this callback is unregistered */ return (0); } @@ -272,19 +314,18 @@ int dsl_prop_get(const char *dsname, const char *propname, int intsz, int numints, void *buf, char *setpoint) { - dsl_dataset_t *ds; - int err; + objset_t *os; + int error; - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_prop_get_ds(ds, propname, intsz, numints, buf, setpoint); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + error = dsl_prop_get_ds(dmu_objset_ds(os), propname, + intsz, numints, buf, setpoint); - dsl_dataset_rele(ds, FTAG); - return (err); + dmu_objset_rele(os, FTAG); + return (error); } /* @@ -302,17 +343,11 @@ dsl_prop_get_integer(const char *ddname, return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } -void -dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, - zprop_source_t source, uint64_t *value) +int +dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, + uint64_t *valuep) { - psa->psa_name = propname; - psa->psa_source = source; - psa->psa_intsz = 8; - psa->psa_numints = 1; - psa->psa_value = value; - - psa->psa_effective_value = -1ULL; + return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); } /* @@ -326,11 +361,10 @@ dsl_prop_setarg_init_uint64(dsl_prop_set * a property not handled by this function. */ int -dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp) { - const char *propname = psa->psa_name; zfs_prop_t prop = zfs_name_to_prop(propname); - zprop_source_t source = psa->psa_source; objset_t *mos; uint64_t zapobj; uint64_t version; @@ -348,7 +382,7 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl } mos = dd->dd_pool->dp_meta_objset; - zapobj = dd->dd_phys->dd_props_zapobj; + zapobj = dsl_dir_phys(dd)->dd_props_zapobj; recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); version = spa_version(dd->dd_pool->dp_spa); @@ -362,36 +396,33 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl switch (source) { case ZPROP_SRC_NONE: /* Revert to the received value, if any. */ - err = zap_lookup(mos, zapobj, recvdstr, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = 0; + *newvalp = 0; break; case ZPROP_SRC_LOCAL: - psa->psa_effective_value = *(uint64_t *)psa->psa_value; + *newvalp = value; break; case ZPROP_SRC_RECEIVED: /* * If there's no local setting, then the new received value will * be the effective value. 
*/ - err = zap_lookup(mos, zapobj, propname, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = *(uint64_t *)psa->psa_value; + *newvalp = value; break; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * We're clearing the received value, so the local setting (if * it exists) remains the effective value. */ - err = zap_lookup(mos, zapobj, propname, 8, 1, - &psa->psa_effective_value); + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) - psa->psa_effective_value = 0; + *newvalp = 0; break; default: - cmn_err(CE_PANIC, "unexpected property source: %d", source); + panic("unexpected property source: %d", source); } strfree(recvdstr); @@ -402,92 +433,103 @@ dsl_prop_predict_sync(dsl_dir_t *dd, dsl return (err); } -#ifdef ZFS_DEBUG +/* + * Unregister all callbacks that are registered with the + * given callback argument. + */ void -dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg) { - zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); - uint64_t intval; - char setpoint[MAXNAMELEN]; - uint64_t version = spa_version(dd->dd_pool->dp_spa); - int err; + dsl_prop_cb_record_t *cbr, *next_cbr; - if (version < SPA_VERSION_RECVD_PROPS) { - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_RESERVATION: - return; + dsl_dir_t *dd = ds->ds_dir; + + mutex_enter(&dd->dd_lock); + next_cbr = list_head(&ds->ds_prop_cbs); + while (next_cbr != NULL) { + cbr = next_cbr; + next_cbr = list_next(&ds->ds_prop_cbs, cbr); + if (cbr->cbr_arg == cbarg) { + list_remove(&ds->ds_prop_cbs, cbr); + list_remove(&cbr->cbr_pr->pr_cbs, cbr); + kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); } } + mutex_exit(&dd->dd_lock); +} - err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, - setpoint, B_FALSE); - if (err == 0 && intval != psa->psa_effective_value) { - cmn_err(CE_PANIC, "%s property, source: %x, " - "predicted effective value: %llu, " - "actual effective value: %llu (setpoint: %s)", - psa->psa_name, psa->psa_source, - (unsigned long long)psa->psa_effective_value, - (unsigned long long)intval, setpoint); - } +boolean_t +dsl_prop_hascb(dsl_dataset_t *ds) +{ + return (!list_is_empty(&ds->ds_prop_cbs)); } -#endif -/* - * Unregister this callback. Return 0 on success, ENOENT if ddname is - * invalid, ENOMSG if no matching callback registered. - */ -int -dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, - dsl_prop_changed_cb_t *callback, void *cbarg) +/* ARGSUSED */ +static int +dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dsl_dir_t *dd = ds->ds_dir; + dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (cbr->cbr_ds == ds && - cbr->cbr_func == callback && - cbr->cbr_arg == cbarg && - strcmp(cbr->cbr_propname, propname) == 0) - break; - } + for (pr = list_head(&dd->dd_props); + pr; pr = list_next(&dd->dd_props, pr)) { + for (cbr = list_head(&pr->pr_cbs); cbr; + cbr = list_next(&pr->pr_cbs, cbr)) { + uint64_t value; - if (cbr == NULL) { - mutex_exit(&dd->dd_lock); - return (ENOMSG); - } + /* + * Callback entries do not have holds on their + * datasets so that datasets with registered + * callbacks are still eligible for eviction. + * Unlike operations to update properties on a + * single dataset, we are performing a recursive + * descent of related head datasets. 
The caller + * of this function only has a dataset hold on + * the passed in head dataset, not the snapshots + * associated with this dataset. Without a hold, + * the dataset pointer within callback records + * for snapshots can be invalidated by eviction + * at any time. + * + * Use dsl_dataset_try_add_ref() to verify + * that the dataset for a snapshot has not + * begun eviction processing and to prevent + * eviction from occurring for the duration of + * the callback. If the hold attempt fails, + * this object is already being evicted and the + * callback can be safely ignored. + */ + if (ds != cbr->cbr_ds && + !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) + continue; + + if (dsl_prop_get_ds(cbr->cbr_ds, + cbr->cbr_pr->pr_propname, sizeof (value), 1, + &value, NULL) == 0) + cbr->cbr_func(cbr->cbr_arg, value); - list_remove(&dd->dd_prop_cbs, cbr); + if (ds != cbr->cbr_ds) + dsl_dataset_rele(cbr->cbr_ds, FTAG); + } + } mutex_exit(&dd->dd_lock); - kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); - kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); - /* Clean up from dsl_prop_register */ - dsl_dir_close(dd, cbr); return (0); } /* - * Return the number of callbacks that are registered for this dataset. + * Update all property values for ddobj & its descendants. This is used + * when renaming the dir. */ -int -dsl_prop_numcb(dsl_dataset_t *ds) +void +dsl_prop_notify_all(dsl_dir_t *dd) { - dsl_dir_t *dd = ds->ds_dir; - dsl_prop_cb_record_t *cbr; - int num = 0; - - mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); - cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { - if (cbr->cbr_ds == ds) - num++; - } - mutex_exit(&dd->dd_lock); - - return (num); + dsl_pool_t *dp = dd->dd_pool; + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, + NULL, DS_FIND_CHILDREN); } static void @@ -495,14 +537,15 @@ dsl_prop_changed_notify(dsl_pool_t *dp, const char *propname, uint64_t value, int first) { dsl_dir_t *dd; + dsl_prop_record_t *pr; dsl_prop_cb_record_t *cbr; objset_t *mos = dp->dp_meta_objset; zap_cursor_t zc; zap_attribute_t *za; int err; - ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err) return; @@ -511,36 +554,49 @@ dsl_prop_changed_notify(dsl_pool_t *dp, * If the prop is set here, then this change is not * being inherited here or below; stop the recursion. */ - err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname); if (err == 0) { - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); return; } ASSERT3U(err, ==, ENOENT); } mutex_enter(&dd->dd_lock); - for (cbr = list_head(&dd->dd_prop_cbs); cbr; - cbr = list_next(&dd->dd_prop_cbs, cbr)) { - uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj; + pr = dsl_prop_record_find(dd, propname); + if (pr != NULL) { + for (cbr = list_head(&pr->pr_cbs); cbr; + cbr = list_next(&pr->pr_cbs, cbr)) { + uint64_t propobj; - if (strcmp(cbr->cbr_propname, propname) != 0) - continue; + /* + * cbr->cbr_ds may be invalidated due to eviction, + * requiring the use of dsl_dataset_try_add_ref(). + * See comment block in dsl_prop_notify_all_cb() + * for details. + */ + if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) + continue; - /* - * If the property is set on this ds, then it is not - * inherited here; don't call the callback. 
- */ - if (propobj && 0 == zap_contains(mos, propobj, propname)) - continue; + propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; - cbr->cbr_func(cbr->cbr_arg, value); + /* + * If the property is not set on this ds, then it is + * inherited here; call the callback. + */ + if (propobj == 0 || + zap_contains(mos, propobj, propname) != 0) + cbr->cbr_func(cbr->cbr_arg, value); + + dsl_dataset_rele(cbr->cbr_ds, FTAG); + } } mutex_exit(&dd->dd_lock); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, mos, - dd->dd_phys->dd_child_dir_zapobj); + dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { dsl_prop_changed_notify(dp, za->za_first_integer, @@ -548,47 +604,41 @@ dsl_prop_changed_notify(dsl_pool_t *dp, } kmem_free(za, sizeof (zap_attribute_t)); zap_cursor_fini(&zc); - dsl_dir_close(dd, FTAG); + dsl_dir_rele(dd, FTAG); } void -dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - dsl_prop_setarg_t *psa = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t zapobj, intval, dummy; int isint; char valbuf[32]; - char *valstr = NULL; + const char *valstr = NULL; char *inheritstr; char *recvdstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); - const char *propname = psa->psa_name; - zprop_source_t source = psa->psa_source; - isint = (dodefault(propname, 8, 1, &intval) == 0); + isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0); - if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { ASSERT(version >= SPA_VERSION_SNAP_PROPS); - if (ds->ds_phys->ds_props_obj == 0) { + if (dsl_dataset_phys(ds)->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_props_obj = + dsl_dataset_phys(ds)->ds_props_obj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); } - zapobj = ds->ds_phys->ds_props_obj; + zapobj = dsl_dataset_phys(ds)->ds_props_obj; } else { - zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; + zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; } if (version < SPA_VERSION_RECVD_PROPS) { - zfs_prop_t prop = zfs_name_to_prop(propname); - if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) - return; - if (source & ZPROP_SRC_NONE) source = ZPROP_SRC_NONE; else if (source & ZPROP_SRC_RECEIVED) @@ -617,8 +667,8 @@ dsl_prop_set_sync(void *arg1, void *arg2 */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - VERIFY(0 == zap_update(mos, zapobj, propname, - psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); + VERIFY0(zap_update(mos, zapobj, propname, + intsz, numints, value, tx)); break; case ZPROP_SRC_INHERITED: /* @@ -629,11 +679,10 @@ dsl_prop_set_sync(void *arg1, void *arg2 err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && - zap_contains(mos, zapobj, ZPROP_HAS_RECVD) == 0) { + dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; - err = zap_update(mos, zapobj, inheritstr, - 8, 1, &dummy, tx); - ASSERT(err == 0); + VERIFY0(zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx)); } break; case ZPROP_SRC_RECEIVED: @@ -641,7 +690,7 @@ dsl_prop_set_sync(void *arg1, void *arg2 * set propname$recvd -> value */ err = zap_update(mos, zapobj, recvdstr, - psa->psa_intsz, 
psa->psa_numints, psa->psa_value, tx); + intsz, numints, value, tx); ASSERT(err == 0); break; case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): @@ -671,9 +720,9 @@ dsl_prop_set_sync(void *arg1, void *arg2 strfree(recvdstr); if (isint) { - VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); + VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this @@ -681,10 +730,10 @@ dsl_prop_set_sync(void *arg1, void *arg2 * ds here. */ mutex_enter(&ds->ds_dir->dd_lock); - for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; - cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { - if (cbr->cbr_ds == ds && - strcmp(cbr->cbr_propname, propname) == 0) + for (cbr = list_head(&ds->ds_prop_cbs); cbr; + cbr = list_next(&ds->ds_prop_cbs, cbr)) { + if (strcmp(cbr->cbr_pr->pr_propname, + propname) == 0) cbr->cbr_func(cbr->cbr_arg, intval); } mutex_exit(&ds->ds_dir->dd_lock); @@ -698,7 +747,7 @@ dsl_prop_set_sync(void *arg1, void *arg2 valstr = valbuf; } else { if (source == ZPROP_SRC_LOCAL) { - valstr = (char *)psa->psa_value; + valstr = value; } else { tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); if (dsl_prop_get_ds(ds, propname, 1, @@ -707,146 +756,81 @@ dsl_prop_set_sync(void *arg1, void *arg2 } } - spa_history_internal_log((source == ZPROP_SRC_NONE || - source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : - LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, - "%s=%s dataset = %llu", propname, - (valstr == NULL ? "" : valstr), ds->ds_object); + spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || + source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, + "%s=%s", propname, (valstr == NULL ? "" : valstr)); if (tbuf != NULL) kmem_free(tbuf, ZAP_MAXVALUELEN); } -void -dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +int +dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value) { - dsl_dataset_t *ds = arg1; - dsl_props_arg_t *pa = arg2; - nvlist_t *props = pa->pa_props; - dsl_prop_setarg_t psa; - nvpair_t *elem = NULL; + nvlist_t *nvl = fnvlist_alloc(); + int error; - psa.psa_source = pa->pa_source; - - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - nvpair_t *pair = elem; - - psa.psa_name = nvpair_name(pair); - - if (nvpair_type(pair) == DATA_TYPE_NVLIST) { - /* - * dsl_prop_get_all_impl() returns properties in this - * format. 
- */ - nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); - } - - if (nvpair_type(pair) == DATA_TYPE_STRING) { - VERIFY(nvpair_value_string(pair, - (char **)&psa.psa_value) == 0); - psa.psa_intsz = 1; - psa.psa_numints = strlen(psa.psa_value) + 1; - } else { - uint64_t intval; - VERIFY(nvpair_value_uint64(pair, &intval) == 0); - psa.psa_intsz = sizeof (intval); - psa.psa_numints = 1; - psa.psa_value = &intval; - } - dsl_prop_set_sync(ds, &psa, cr, tx); - } + fnvlist_add_uint64(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); } -void -dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, - cred_t *cr, dmu_tx_t *tx) +int +dsl_prop_set_string(const char *dsname, const char *propname, + zprop_source_t source, const char *value) { - objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t zapobj = dd->dd_phys->dd_props_zapobj; - - ASSERT(dmu_tx_is_syncing(tx)); - - VERIFY(0 == zap_update(mos, zapobj, name, sizeof (val), 1, &val, tx)); + nvlist_t *nvl = fnvlist_alloc(); + int error; - dsl_prop_changed_notify(dd->dd_pool, dd->dd_object, name, val, TRUE); - - spa_history_internal_log(LOG_DS_PROPSET, dd->dd_pool->dp_spa, tx, cr, - "%s=%llu dataset = %llu", name, (u_longlong_t)val, - dd->dd_phys->dd_head_dataset_obj); + fnvlist_add_string(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); } int -dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, - int intsz, int numints, const void *buf) +dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source) { - dsl_dataset_t *ds; - uint64_t version; - int err; - dsl_prop_setarg_t psa; - - /* - * We must do these checks before we get to the syncfunc, since - * it can't fail. - */ - if (strlen(propname) >= ZAP_MAXNAMELEN) - return (ENAMETOOLONG); - - err = dsl_dataset_hold(dsname, FTAG, &ds); - if (err) - return (err); - - version = spa_version(ds->ds_dir->dd_pool->dp_spa); - if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? - ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { - dsl_dataset_rele(ds, FTAG); - return (E2BIG); - } - if (dsl_dataset_is_snapshot(ds) && - version < SPA_VERSION_SNAP_PROPS) { - dsl_dataset_rele(ds, FTAG); - return (ENOTSUP); - } - - psa.psa_name = propname; - psa.psa_source = source; - psa.psa_intsz = intsz; - psa.psa_numints = numints; - psa.psa_value = buf; - psa.psa_effective_value = -1ULL; - - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - NULL, dsl_prop_set_sync, ds, &psa, 2); + nvlist_t *nvl = fnvlist_alloc(); + int error; - dsl_dataset_rele(ds, FTAG); - return (err); -} + fnvlist_add_boolean(nvl, propname); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} + +typedef struct dsl_props_set_arg { + const char *dpsa_dsname; + zprop_source_t dpsa_source; + nvlist_t *dpsa_props; +} dsl_props_set_arg_t; -int -dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +static int +dsl_props_set_check(void *arg, dmu_tx_t *tx) { + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t version; nvpair_t *elem = NULL; - dsl_props_arg_t pa; int err; - if (err = dsl_dataset_hold(dsname, FTAG, &ds)) + err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); + if (err != 0) return (err); - /* - * Do these checks before the syncfunc, since it can't fail. 
- */ + version = spa_version(ds->ds_dir->dd_pool->dp_spa); - while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { dsl_dataset_rele(ds, FTAG); - return (ENAMETOOLONG); + return (SET_ERROR(ENAMETOOLONG)); } if (nvpair_type(elem) == DATA_TYPE_STRING) { - char *valstr; - VERIFY(nvpair_value_string(elem, &valstr) == 0); + char *valstr = fnvpair_value_string(elem); if (strlen(valstr) >= (version < SPA_VERSION_STMF_PROP ? ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { @@ -856,20 +840,83 @@ dsl_props_set(const char *dsname, zprop_ } } - if (dsl_dataset_is_snapshot(ds) && - version < SPA_VERSION_SNAP_PROPS) { + if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx) +{ + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. + */ + nvlist_t *attrs = fnvpair_value_nvlist(pair); + pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); + } - pa.pa_props = props; - pa.pa_source = source; + if (nvpair_type(pair) == DATA_TYPE_STRING) { + const char *value = fnvpair_value_string(pair); + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, 1, strlen(value) + 1, value, tx); + } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { + uint64_t intval = fnvpair_value_uint64(pair); + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, sizeof (intval), 1, &intval, tx); + } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { + dsl_prop_set_sync_impl(ds, nvpair_name(pair), + source, 0, 0, NULL, tx); + } else { + panic("invalid nvpair type"); + } + } +} - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - NULL, dsl_props_set_sync, ds, &pa, 2); +static void +dsl_props_set_sync(void *arg, dmu_tx_t *tx) +{ + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); + dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); dsl_dataset_rele(ds, FTAG); - return (err); +} + +/* + * All-or-nothing; if any prop can't be set, nothing will be modified. + */ +int +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +{ + dsl_props_set_arg_t dpsa; + int nblks = 0; + + dpsa.dpsa_dsname = dsname; + dpsa.dpsa_source = source; + dpsa.dpsa_props = props; + + /* + * If the source includes NONE, then we will only be removing entries + * from the ZAP object. In that case don't check for ENOSPC. 
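
/*
 * [Editor's note, not part of the patch.]  The conversion here follows the
 * dsl_sync_task() model used throughout this patch: everything that can
 * fail (property name length, value length, SPA version checks) lives in
 * dsl_props_set_check() above, while dsl_props_set_sync() only re-holds
 * the dataset with VERIFY0() and must succeed.  As with other
 * dsl_sync_task() users, the check function is expected to run again in
 * syncing context just before the sync function, and the nblks value
 * computed just below is the blocks-modified estimate handed to
 * dsl_sync_task() for its space check.
 */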
+ */ + if ((source & ZPROP_SRC_NONE) == 0) + nblks = 2 * fnvlist_num_pairs(props); + + return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, + &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED)); } typedef enum dsl_prop_getflags { @@ -1012,20 +1059,20 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, n dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err = 0; - char setpoint[MAXNAMELEN]; + char setpoint[ZFS_MAX_DATASET_NAME_LEN]; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) flags |= DSL_PROP_GET_SNAPSHOT; - rw_enter(&dp->dp_config_rwlock, RW_READER); + ASSERT(dsl_pool_config_held(dp)); - if (ds->ds_phys->ds_props_obj != 0) { + if (dsl_dataset_phys(ds)->ds_props_obj != 0) { ASSERT(flags & DSL_PROP_GET_SNAPSHOT); dsl_dataset_name(ds, setpoint); - err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj, - setpoint, flags, *nvp); + err = dsl_prop_get_all_impl(mos, + dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); if (err) goto out; } @@ -1038,64 +1085,57 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, n flags |= DSL_PROP_GET_INHERITING; } dsl_dir_name(dd, setpoint); - err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj, - setpoint, flags, *nvp); + err = dsl_prop_get_all_impl(mos, + dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); if (err) break; } out: - rw_exit(&dp->dp_config_rwlock); return (err); } boolean_t -dsl_prop_get_hasrecvd(objset_t *os) +dsl_prop_get_hasrecvd(const char *dsname) { - dsl_dataset_t *ds = os->os_dsl_dataset; - int rc; uint64_t dummy; - rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); - rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); - rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); - ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); - return (rc == 0); + return (0 == + dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); } -static void -dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) +static int +dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) { - dsl_dataset_t *ds = os->os_dsl_dataset; - uint64_t dummy = 0; - dsl_prop_setarg_t psa; - - if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) - return; - - dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); + uint64_t version; + spa_t *spa; + int error = 0; - (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, - dsl_prop_set_sync, ds, &psa, 2); + VERIFY0(spa_open(dsname, &spa, FTAG)); + version = spa_version(spa); + spa_close(spa, FTAG); + + if (version >= SPA_VERSION_RECVD_PROPS) + error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); + return (error); } /* * Call after successfully receiving properties to ensure that only the first * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. 
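
/*
 * [Editor's note -- illustrative sketch, not part of the patch.]
 * The dsl_prop_set_int()/dsl_prop_set_string()/dsl_prop_inherit() wrappers
 * introduced above each build a one-entry nvlist and funnel through
 * dsl_props_set(), so a batch of properties is applied all-or-nothing by a
 * single sync task.  A caller might use them roughly as follows; the
 * dataset name and values are made up for illustration.
 */
static int
example_set_props(void)
{
	nvlist_t *props = fnvlist_alloc();
	int error;

	fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_QUOTA),
	    10ULL << 30);				/* 10 GiB */
	fnvlist_add_string(props, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
	    "/export/data");
	error = dsl_props_set("tank/data", ZPROP_SRC_LOCAL, props);
	fnvlist_free(props);

	/* single-property shorthand, same all-or-nothing sync task */
	if (error == 0)
		error = dsl_prop_set_int("tank/data",
		    zfs_prop_to_name(ZFS_PROP_QUOTA), ZPROP_SRC_LOCAL,
		    20ULL << 30);
	return (error);
}
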
*/ -void -dsl_prop_set_hasrecvd(objset_t *os) +int +dsl_prop_set_hasrecvd(const char *dsname) { - if (dsl_prop_get_hasrecvd(os)) { - ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); - return; - } - dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); + int error = 0; + if (!dsl_prop_get_hasrecvd(dsname)) + error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); + return (error); } void -dsl_prop_unset_hasrecvd(objset_t *os) +dsl_prop_unset_hasrecvd(const char *dsname) { - dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); + VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); } int @@ -1105,16 +1145,25 @@ dsl_prop_get_all(objset_t *os, nvlist_t } int -dsl_prop_get_received(objset_t *os, nvlist_t **nvp) +dsl_prop_get_received(const char *dsname, nvlist_t **nvp) { + objset_t *os; + int error; + /* * Received properties are not distinguishable from local properties * until the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. */ - dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); - return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); + dmu_objset_rele(os, FTAG); + return (error); } void @@ -1132,7 +1181,7 @@ dsl_prop_nvlist_add_uint64(nvlist_t *nv, VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); /* Indicate the default source if we can. */ - if (dodefault(propname, 8, 1, &default_value) == 0 && + if (dodefault(prop, 8, 1, &default_value) == 0 && value == default_value) { VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scan.c 10 Oct 2016 11:09:56 -0000 @@ -0,0 +1,1922 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
+ * Copyright 2016 Gary Mills + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif + +typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, + const zbookmark_phys_t *); + +static scan_cb_t dsl_scan_scrub_cb; +static void dsl_scan_cancel_sync(void *, dmu_tx_t *); +static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); +static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *); + +unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ +unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ +unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ + +unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ +unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ +unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver + per txg */ +boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, + &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, + &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, + &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, + &zfs_scan_idle, 0, "Idle scan window in clock ticks"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, + &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, + &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); +SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, + &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, + &zfs_no_scrub_io, 0, "Disable scrub I/O"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, + &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); + +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +/* max number of blocks to free in a single TXG */ +uint64_t zfs_free_max_blocks = UINT64_MAX; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, + &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG"); + + +#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ + (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + +extern int zfs_txg_timeout; + +/* + * Enable/disable the processing of the free_bpobj object. 
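
/*
 * [Editor's note -- illustrative sketch, not part of the patch.]
 * The zfs_* knobs declared above are exported under the vfs.zfs tree by
 * the SYSCTL_* macros, so on a built system they should be adjustable with
 * sysctl(8), e.g. "sysctl vfs.zfs.scan_min_time_ms=2000", or readable from
 * userland with sysctlbyname(3).  Whether every MIB name is present
 * depends on how the module is built; the program below is an assumption
 * for illustration only.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned int ms;
	size_t len = sizeof(ms);

	if (sysctlbyname("vfs.zfs.scan_min_time_ms", &ms, &len, NULL, 0) == 0)
		printf("scrub runs at least %u ms per txg\n", ms);
	return (0);
}
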
+ */ +boolean_t zfs_free_bpobj_enabled = B_TRUE; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN, + &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing"); + +/* the order has to match pool_scan_type */ +static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { + NULL, + dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ +}; + +int +dsl_scan_init(dsl_pool_t *dp, uint64_t txg) +{ + int err; + dsl_scan_t *scn; + spa_t *spa = dp->dp_spa; + uint64_t f; + + scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); + scn->scn_dp = dp; + + /* + * It's possible that we're resuming a scan after a reboot so + * make sure that the scan_async_destroying flag is initialized + * appropriately. + */ + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY); + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_func", sizeof (uint64_t), 1, &f); + if (err == 0) { + /* + * There was an old-style scrub in progress. Restart a + * new-style scrub from the beginning. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("old-style scrub was in progress; " + "restarting new-style scrub in txg %llu", + scn->scn_restart_txg); + + /* + * Load the queue obj from the old location so that it + * can be freed by dsl_scan_done(). + */ + (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_queue", sizeof (uint64_t), 1, + &scn->scn_phys.scn_queue_obj); + } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys); + if (err == ENOENT) + return (0); + else if (err) + return (err); + + if (scn->scn_phys.scn_state == DSS_SCANNING && + spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { + /* + * A new-type scrub was in progress on an old + * pool, and the pool was accessed by old + * software. Restart from the beginning, since + * the old software may have changed the pool in + * the meantime. 
+ */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("new-style scrub was modified " + "by old software; restarting in txg %llu", + scn->scn_restart_txg); + } + } + + spa_scan_stat_init(spa); + return (0); +} + +void +dsl_scan_fini(dsl_pool_t *dp) +{ + if (dp->dp_scan) { + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); + dp->dp_scan = NULL; + } +} + +/* ARGSUSED */ +static int +dsl_scan_setup_check(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (scn->scn_phys.scn_state == DSS_SCANNING) + return (SET_ERROR(EBUSY)); + + return (0); +} + +static void +dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dmu_object_type_t ot = 0; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_restart_txg = 0; + scn->scn_done_txg = 0; + spa_scan_stat_init(spa); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; + + /* rewrite all disk labels */ + vdev_config_dirty(spa->spa_root_vdev); + + if (vdev_resilver_needed(spa->spa_root_vdev, + &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { + spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); + } else { + spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); + } + + spa->spa_scrub_started = B_TRUE; + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. + */ + if (scn->scn_phys.scn_min_txg > TXG_INITIAL) + scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + } + bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); + + dsl_scan_sync_state(scn, tx); + + spa_history_log_internal(spa, "scan setup", tx, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); +} + +/* ARGSUSED */ +static void +dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + static const char *old_names[] = { + "scrub_bookmark", + "scrub_ddt_bookmark", + "scrub_ddt_class_max", + "scrub_queue", + "scrub_min_txg", + "scrub_max_txg", + "scrub_func", + "scrub_errors", + NULL + }; + + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + int i; + + /* Remove any remnants of an old-style scrub. 
*/ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } + + /* + * If we were "restarted" from a stopped state, don't bother + * with anything else. + */ + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (complete) + scn->scn_phys.scn_state = DSS_FINISHED; + else + scn->scn_phys.scn_state = DSS_CANCELED; + + if (dsl_scan_restarting(scn, tx)) + spa_history_log_internal(spa, "scan aborted, restarting", tx, + "errors=%llu", spa_get_errlog_size(spa)); + else if (!complete) + spa_history_log_internal(spa, "scan cancelled", tx, + "errors=%llu", spa_get_errlog_size(spa)); + else + spa_history_log_internal(spa, "scan done", tx, + "errors=%llu", spa_get_errlog_size(spa)); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + spa->spa_scrub_started = B_FALSE; + spa->spa_scrub_active = B_FALSE; + + /* + * If the scrub/resilver completed, update all DTLs to + * reflect this. Whether it succeeded or not, vacate + * all temporary scrub DTLs. + */ + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); + if (complete) { + spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); + } + spa_errlog_rotate(spa); + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + } + + scn->scn_phys.scn_end_time = gethrestime_sec(); +} + +/* ARGSUSED */ +static int +dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return (SET_ERROR(ENOENT)); + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + dsl_scan_done(scn, B_FALSE, tx); + dsl_scan_sync_state(scn, tx); +} + +int +dsl_scan_cancel(dsl_pool_t *dp) +{ + return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, + dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); +} + +static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, + dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, + dmu_objset_type_t ostype, dmu_tx_t *tx); +static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, + dmu_objset_type_t ostype, + dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); + +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) +{ + zio_free(dp->dp_spa, txg, bp); +} + +void +dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) +{ + ASSERT(dsl_pool_sync_context(dp)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), + pio->io_flags)); +} + +static uint64_t +dsl_scan_ds_maxtxg(dsl_dataset_t *ds) +{ + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (ds->ds_is_snapshot) + return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); + return (smt); +} + +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + 
&scn->scn_phys, tx)); +} + +extern int zfs_vdev_async_write_active_min_dirty_percent; + +static boolean_t +dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* we never skip user/group accounting objects */ + if (zb && (int64_t)zb->zb_object < 0) + return (B_FALSE); + + if (scn->scn_pausing) + return (B_TRUE); /* we're already pausing */ + + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 blocks. */ + if (zb && zb->zb_level != 0) + return (B_FALSE); + + /* + * We pause if: + * - we have scanned for the maximum time: an entire txg + * timeout (default 5 sec) + * or + * - we have scanned for at least the minimum time (default 1 sec + * for scrub, 3 sec for resilver), and either we have sufficient + * dirty data that we are starting to write more quickly + * (default 30%), or someone is explicitly waiting for this txg + * to complete. + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. + */ + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scan_min_time_ms; + uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; + if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || + (NSEC2MSEC(elapsed_nanosecs) > mintime && + (txg_sync_waiting(scn->scn_dp) || + dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + scn->scn_phys.scn_bookmark = *zb; + } + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + scn->scn_pausing = B_TRUE; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_scan_arg { + dsl_pool_t *zsa_dp; + zil_header_t *zsa_zh; +} zil_scan_arg_t; + +/* ARGSUSED */ +static int +dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + zbookmark_phys_t zb; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * scrub there's nothing to do to it). 
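
/*
 * [Editor's note, not part of the patch.]  Putting numbers on
 * dsl_scan_check_pause() above: with the defaults declared earlier
 * (zfs_scan_min_time_ms = 1000, zfs_resilver_min_time_ms = 3000) and the
 * default zfs_txg_timeout of 5 seconds noted in the comment, a scrub pass
 * roughly gives up its txg after 5 seconds unconditionally, or after only
 * 1 second (3 seconds for resilver) once dirty data crosses
 * zfs_vdev_async_write_active_min_dirty_percent (default 30%) or someone
 * is waiting on the txg to sync; a pool that is shutting down or being
 * exported pauses immediately.
 */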
+ */ + if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + return (0); +} + +/* ARGSUSED */ +static int +dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_phys_t zb; + + if (BP_IS_HOLE(bp) || + bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + } + return (0); +} + +static void +dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_scan_arg_t zsa = { dp, zh }; + zilog_t *zilog; + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, + claim_txg); + + zil_free(zilog); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, + uint64_t objset, uint64_t object, uint64_t blkid) +{ + zbookmark_phys_t czb; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + + if (zfs_no_scrub_prefetch) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + return; + + SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); +} + +static boolean_t +dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, + const zbookmark_phys_t *zb) +{ + /* + * We never skip over user/group accounting objects (obj<0) + */ + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && + (int64_t)zb->zb_object >= 0) { + /* + * If we already visited this bp & everything below (in + * a prior txg sync), don't bother doing it again. + */ + if (zbookmark_subtree_completed(dnp, zb, + &scn->scn_phys.scn_bookmark)) + return (B_TRUE); + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for pausing + * again. + */ + if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + } + } + return (B_FALSE); +} + +/* + * Return nonzero on i/o error. + * Return new buf to write out in *bufp. 
+ */ +static int +dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_phys_t *zb, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + int err; + + if (BP_GET_LEVEL(bp) > 0) { + arc_flags_t flags = ARC_FLAG_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + arc_buf_t *buf; + + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, + zb->zb_object, zb->zb_blkid * epb + i); + } + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + zbookmark_phys_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + dsl_scan_visitbp(cbp, &czb, dnp, + ds, scn, ostype, tx); + } + arc_buf_destroy(buf, &buf); + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + arc_flags_t flags = ARC_FLAG_WAIT; + dnode_phys_t *cdnp; + int i, j; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + arc_buf_t *buf; + + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + for (j = 0; j < cdnp->dn_nblkptr; j++) { + blkptr_t *cbp = &cdnp->dn_blkptr[j]; + dsl_scan_prefetch(scn, buf, cbp, + zb->zb_objset, zb->zb_blkid * epb + i, j); + } + } + for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + dsl_scan_visitdnode(scn, ds, ostype, + cdnp, zb->zb_blkid * epb + i, tx); + } + + arc_buf_destroy(buf, &buf); + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + arc_flags_t flags = ARC_FLAG_WAIT; + objset_phys_t *osp; + arc_buf_t *buf; + + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + + osp = buf->b_data; + + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); + + if (OBJSET_BUF_HAS_USERUSED(buf)) { + /* + * We also always visit user/group accounting + * objects, and never skip them, even if we are + * pausing. This is necessary so that the space + * deltas from this txg get integrated. + */ + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, tx); + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_userused_dnode, + DMU_USERUSED_OBJECT, tx); + } + arc_buf_destroy(buf, &buf); + } + + return (0); +} + +static void +dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, + dmu_objset_type_t ostype, dnode_phys_t *dnp, + uint64_t object, dmu_tx_t *tx) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_phys_t czb; + + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + dnp->dn_nlevels - 1, j); + dsl_scan_visitbp(&dnp->dn_blkptr[j], + &czb, dnp, ds, scn, ostype, tx); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zbookmark_phys_t czb; + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + 0, DMU_SPILL_BLKID); + dsl_scan_visitbp(&dnp->dn_spill, + &czb, dnp, ds, scn, ostype, tx); + } +} + +/* + * The arguments are in this order because mdb can only print the + * first 5; we want them to be useful. 
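
/*
 * [Editor's note, not part of the patch.]  For a sense of scale in
 * dsl_scan_recurse() above: with the usual 128-byte blkptr_t
 * (SPA_BLKPTRSHIFT == 7) a 128K indirect block yields
 * epb = 131072 >> 7 = 1024 child block pointers per arc_read(), and with
 * 512-byte on-disk dnodes (DNODE_SHIFT == 9) a typical 16K dnode block
 * yields epb = 16384 >> 9 = 32 dnodes, each contributing up to
 * dn_nblkptr (at most 3) block pointers plus an optional spill pointer.
 */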
+ */ +static void +dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, + dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, + dmu_objset_type_t ostype, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + arc_buf_t *buf = NULL; + blkptr_t bp_toread = *bp; + + /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ + + if (dsl_scan_check_pause(scn, zb)) + return; + + if (dsl_scan_check_resume(scn, dnp, zb)) + return; + + if (BP_IS_HOLE(bp)) + return; + + scn->scn_visited_this_txg++; + + dprintf_bp(bp, + "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", + ds, ds ? ds->ds_object : 0, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + bp); + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return; + + if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0) + return; + + /* + * If dsl_scan_ddt() has aready visited this block, it will have + * already done any translations or scrubbing, so don't call the + * callback again. + */ + if (ddt_class_contains(dp->dp_spa, + scn->scn_phys.scn_ddt_class_max, bp)) { + ASSERT(buf == NULL); + return; + } + + /* + * If this block is from the future (after cur_max_txg), then we + * are doing this on behalf of a deleted snapshot, and we will + * revisit the future block on the next pass of this dataset. + * Don't scan it now unless we need to because something + * under it was modified. + */ + if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + } +} + +static void +dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, + dmu_tx_t *tx) +{ + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + dsl_scan_visitbp(bp, &zb, NULL, + ds, scn, DMU_OST_NONE, tx); + + dprintf_ds(ds, "finished scan%s", ""); +} + +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (ds->ds_is_snapshot) { + /* + * Note: + * - scn_cur_{min,max}_txg stays the same. + * - Setting the flag is not really necessary if + * scn_cur_max_txg == scn_max_txg, because there + * is nothing after this snapshot that we care + * about. However, we set it anyway and then + * ignore it when we retraverse it in + * dsl_scan_visitds(). + */ + scn->scn_phys.scn_bookmark.zb_objset = + dsl_dataset_phys(ds)->ds_next_snap_obj; + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)-> + ds_next_snap_obj); + scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + } else { + SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset bookmark to -1,0,0,0", + (u_longlong_t)ds->ds_object); + } + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + if (ds->ds_is_snapshot) { + /* + * We keep the same mintxg; it could be > + * ds_creation_txg if the previous snapshot was + * deleted too. 
+ */ + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + mintxg, tx) == 0); + zfs_dbgmsg("destroying ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)-> + ds_next_snap_obj); + } else { + zfs_dbgmsg("destroying ds %llu; in queue; removing", + (u_longlong_t)ds->ds_object); + } + } + + /* + * dsl_scan_sync() should be called after this, and should sync + * out our changed state, but just to be safe, do it here. + */ + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); + + if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("snapshotting ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } + dsl_scan_sync_state(scn, tx); +} + +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { + scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds1->ds_object, &mintxg) == 0) { + int err; + + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); + err = zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); + VERIFY(err == 0 || err == EEXIST); + if (err == EEXIST) { + /* Both were there to begin with */ + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + ds1->ds_object, mintxg, tx)); + } + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + 
ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); + VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + dsl_scan_sync_state(scn, tx); +} + +struct enqueue_clones_arg { + dmu_tx_t *tx; + uint64_t originobj; +}; + +/* ARGSUSED */ +static int +enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + struct enqueue_clones_arg *eca = arg; + dsl_dataset_t *ds; + int err; + dsl_scan_t *scn = dp->dp_scan; + + if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) + return (0); + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + objset_t *os; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + if (scn->scn_phys.scn_cur_min_txg >= + scn->scn_phys.scn_max_txg) { + /* + * This can happen if this snapshot was created after the + * scan started, and we already completed a previous snapshot + * that was created after the scan started. This snapshot + * only references blocks with: + * + * birth < our ds_creation_txg + * cur_min_txg is no less than ds_creation_txg. + * We have already visited these blocks. + * or + * birth > scn_max_txg + * The scan requested not to visit these blocks. + * + * Subsequent snapshots (and clones) can reference our + * blocks, or blocks with even higher birth times. + * Therefore we do not need to visit them either, + * so we do not add them to the work queue. + * + * Note that checking for cur_min_txg >= cur_max_txg + * is not sufficient, because in that case we may need to + * visit subsequent snapshots. This happens when min_txg > 0, + * which raises cur_min_txg. In this case we will visit + * this dataset but skip all of its blocks, because the + * rootbp's birth time is < cur_min_txg. Then we will + * add the next snapshots/clones to the work queue. + */ + char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " + "cur_min_txg (%llu) >= max_txg (%llu)", + dsobj, dsname, + scn->scn_phys.scn_cur_min_txg, + scn->scn_phys.scn_max_txg); + kmem_free(dsname, MAXNAMELEN); + + goto out; + } + + if (dmu_objset_from_ds(ds, &os)) + goto out; + + /* + * Only the ZIL in the head (non-snapshot) is valid. Even though + * snapshots can have ZIL block pointers (which may be the same + * BP as in the head), they must be ignored. So we traverse the + * ZIL here, rather than in scan_recurse(), because the regular + * snapshot block-sharing rules don't apply to it. + */ + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) + dsl_scan_zil(dp, &os->os_zil_header); + + /* + * Iterate over the bps in this ds. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " + "pausing=%u", + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_cur_max_txg, + (int)scn->scn_pausing); + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); + + if (scn->scn_pausing) + goto out; + + /* + * We've finished this pass over this dataset. + */ + + /* + * If we did not completely visit this dataset, do another pass. + */ + if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { + zfs_dbgmsg("incomplete pass; visiting again"); + scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, + scn->scn_phys.scn_cur_max_txg, tx) == 0); + goto out; + } + + /* + * Add descendent datasets to work queue. + */ + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); + } + if (dsl_dataset_phys(ds)->ds_num_children > 1) { + boolean_t usenext = B_FALSE; + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. + */ + int err = zap_count(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj, &count); + if (err == 0 && + count == dsl_dataset_phys(ds)->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY0(zap_join_key(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj, + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_creation_txg, tx)); + } else { + struct enqueue_clones_arg eca; + eca.tx = tx; + eca.originobj = ds->ds_object; + + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds; + int err; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. 
+ * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. + * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. + * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to + * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + */ +static void +dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) +{ + ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; + ddt_entry_t dde = { 0 }; + int error; + uint64_t n = 0; + + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + ddt_t *ddt; + + if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) + break; + dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + + /* There should be no pending changes to the dedup table */ + ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; + ASSERT(avl_first(&ddt->ddt_tree) == NULL); + + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + n++; + + if (dsl_scan_check_pause(scn, NULL)) + break; + } + + zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", + (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_pausing); + + ASSERT(error == 0 || error == ENOENT); + ASSERT(error != ENOENT || + ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); +} + +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_phys_t zb = { 0 }; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + +static void +dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + zap_cursor_t zc; + zap_attribute_t za; + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_ddt(scn, tx); + 
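
A minimal sketch (not part of the diff) of how the two scrub phases described above fit together: once dsl_scan_ddt() has walked every DDT class up to scn_ddt_class_max, the top-down traversal can skip any deduplicated block whose entry lies in one of those already-visited classes. The function name is hypothetical; ddt_class_contains() is the existing DDT helper the old dsl_scrub.c code further below uses for the same check.

static boolean_t
scan_bp_covered_by_ddt_phase(spa_t *spa, dsl_scan_t *scn,
    const blkptr_t *bp)
{
	/* Non-dedup blocks are only ever seen by the top-down phase. */
	if (!BP_GET_DEDUP(bp))
		return (B_FALSE);
	/*
	 * Dedup blocks in a class <= scn_ddt_class_max were already
	 * scrubbed by the DDT walk, so the traversal may skip them.
	 */
	return (ddt_class_contains(spa,
	    scn->scn_phys.scn_ddt_class_max, bp));
}
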
if (scn->scn_pausing) + return; + } + + if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { + /* First do the MOS & ORIGIN */ + + scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; + scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; + dsl_scan_visit_rootbp(scn, NULL, + &dp->dp_meta_rootbp, tx); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); + if (scn->scn_pausing) + return; + + if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_cb, tx, DS_FIND_CHILDREN)); + } else { + dsl_scan_visitds(scn, + dp->dp_origin_snap->ds_object, tx); + } + ASSERT(!scn->scn_pausing); + } else if (scn->scn_phys.scn_bookmark.zb_objset != + ZB_DESTROYED_OBJSET) { + /* + * If we were paused, continue from here. Note if the + * ds we were paused on was deleted, the zb_objset may + * be -1, so we will skip this and find a new objset + * below. + */ + dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + if (scn->scn_pausing) + return; + } + + /* + * In case we were paused right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. + */ + bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); + + /* keep pulling things out of the zap-object-as-queue */ + while (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj), + zap_cursor_retrieve(&zc, &za) == 0) { + dsl_dataset_t *ds; + uint64_t dsobj; + + dsobj = strtonum(za.za_name, NULL); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, dsobj, tx)); + + /* Set up min/max txg */ + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (za.za_first_integer != 0) { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + za.za_first_integer); + } else { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg); + } + scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); + dsl_dataset_rele(ds, FTAG); + + dsl_scan_visitds(scn, dsobj, tx); + zap_cursor_fini(&zc); + if (scn->scn_pausing) + return; + } + zap_cursor_fini(&zc); +} + +static boolean_t +dsl_scan_free_should_pause(dsl_scan_t *scn) +{ + uint64_t elapsed_nanosecs; + + if (zfs_recover) + return (B_FALSE); + + if (scn->scn_visited_this_txg >= zfs_free_max_blocks) + return (B_TRUE); + + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +static int +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + + if (!scn->scn_is_bptree || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { + if (dsl_scan_free_should_pause(scn)) + return (SET_ERROR(ERESTART)); + } + + zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, + dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + scn->scn_visited_this_txg++; + return (0); +} + +boolean_t +dsl_scan_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t used = 0, comp, uncomp; + + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if (scn->scn_phys.scn_state == DSS_SCANNING || + 
(scn->scn_async_destroying && !scn->scn_async_stalled)) + return (B_TRUE); + + if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, + &used, &comp, &uncomp); + } + return (used != 0); +} + +void +dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + int err = 0; + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). + */ + if (dsl_scan_restarting(scn, tx)) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, tx->tx_txg); + dsl_scan_setup_sync(&func, tx); + } + + /* + * Only process scans in sync pass 1. + */ + if (spa_sync_pass(dp->dp_spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + /* + * If the scan is inactive due to a stalled async destroy, try again. + */ + if (!scn->scn_async_stalled && !dsl_scan_active(scn)) + return; + + scn->scn_visited_this_txg = 0; + scn->scn_pausing = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + /* + * First process the async destroys. If we pause, don't do + * any scrubbing or resilvering. This ensures that there are no + * async destroys while we are scanning, so the scan code doesn't + * have to worry about traversing it. It is also faster to free the + * blocks than to scrub them. + */ + if (zfs_free_bpobj_enabled && + spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bpobj_iterate(&dp->dp_free_bpobj, + dsl_scan_free_block_cb, scn, tx); + VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + } + + if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + ASSERT(scn->scn_async_destroying); + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); + VERIFY0(zio_wait(scn->scn_zio_root)); + + if (err == EIO || err == ECKSUM) { + err = 0; + } else if (err != 0 && err != ERESTART) { + zfs_panic_recover("error %u from " + "traverse_dataset_destroyed()", err); + } + + if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { + /* finished; deactivate async destroy feature */ + spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); + ASSERT(!spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY0(bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + scn->scn_async_destroying = B_FALSE; + scn->scn_async_stalled = B_FALSE; + } else { + /* + * If we didn't make progress, mark the async + * destroy as stalled, so that we will not initiate + * a spa_sync() on its behalf. Note that we only + * check this if we are not finished, because if the + * bptree had no blocks for us to visit, we can + * finish without "making progress". 
+ */ + scn->scn_async_stalled = + (scn->scn_visited_this_txg == 0); + } + } + if (scn->scn_visited_this_txg) { + zfs_dbgmsg("freed %llu blocks in %llums from " + "free_bpobj/bptree txg %llu; err=%d", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t) + NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), + (longlong_t)tx->tx_txg, err); + scn->scn_visited_this_txg = 0; + + /* + * Write out changes to the DDT that may be required as a + * result of the blocks freed. This ensures that the DDT + * is clean when a scrub/resilver runs. + */ + ddt_sync(spa, tx->tx_txg); + } + if (err != 0) + return; + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && + zfs_free_leak_on_eio && + (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || + dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || + dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { + /* + * We have finished background destroying, but there is still + * some space left in the dp_free_dir. Transfer this leaked + * space to the dp_leak_dir. + */ + if (dp->dp_leak_dir == NULL) { + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + LEAK_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + LEAK_DIR_NAME, &dp->dp_leak_dir)); + rrw_exit(&dp->dp_config_rwlock, FTAG); + } + dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, + dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, + dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, + dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, + -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, + -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); + } + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) { + /* finished; verify that space accounting went to zero */ + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); + } + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + if (scn->scn_done_txg == tx->tx_txg) { + ASSERT(!scn->scn_pausing); + /* finished with scan. 
*/ + zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); + dsl_scan_done(scn, B_TRUE, tx); + ASSERT3U(spa->spa_scrub_inflight, ==, 0); + dsl_scan_sync_state(scn, tx); + return; + } + + if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= + scn->scn_phys.scn_ddt_class_max) { + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, + (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); + ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); + ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); + } else { + zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, + (longlong_t)scn->scn_phys.scn_bookmark.zb_object, + (longlong_t)scn->scn_phys.scn_bookmark.zb_level, + (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); + } + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); + dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, FTAG); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("visited %llu blocks in %llums", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); + + if (!scn->scn_pausing) { + scn->scn_done_txg = tx->tx_txg + 1; + zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", + tx->tx_txg, scn->scn_done_txg); + } + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) { + cv_wait(&spa->spa_scrub_io_cv, + &spa->spa_scrub_lock); + } + mutex_exit(&spa->spa_scrub_lock); + } + + dsl_scan_sync_state(scn, tx); +} + +/* + * This will start a new scan, or restart an existing one. + */ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +/* + * scrub consumers + */ + +static void +count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } +} + +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; + } + mutex_exit(&spa->spa_scrub_lock); +} + +static int +dsl_scan_scrub_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + boolean_t needs_io; + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + unsigned int scan_delay = 0; + + if (phys_birth <= scn->scn_phys.scn_min_txg || + phys_birth >= scn->scn_phys.scn_max_txg) + return (0); + + count_block(dp->dp_blkstats, bp); + + if (BP_IS_EMBEDDED(bp)) + return (0); + + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); + if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { + zio_flags |= ZIO_FLAG_SCRUB; + needs_io = B_TRUE; + scan_delay = zfs_scrub_delay; + } else { + ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); + zio_flags |= ZIO_FLAG_RESILVER; + needs_io = B_FALSE; + scan_delay = zfs_resilver_delay; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + vdev_t *vd = vdev_lookup_top(spa, + DVA_GET_VDEV(&bp->blk_dva[d])); + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) { + if (DVA_GET_GANG(&bp->blk_dva[d])) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. 
+ */ + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + phys_birth, 1); + } + } + } + + if (needs_io && !zfs_no_scrub_io) { + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * + MAX(zfs_top_maxinflight, 1); + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + /* + * If we're seeing recent (zfs_scan_idle) "important" I/Os + * then throttle our workload to limit the impact of a scan. + */ + if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) + delay(MAX((int)scan_delay, 0)); + + zio_nowait(zio_read(NULL, spa, bp, data, size, + dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, + zio_flags, zb)); + } + + /* do not relocate this block */ + return (0); +} + +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); +} + +static boolean_t +dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +{ + return (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c 27 Feb 2010 22:30:59 -0000 1.1.1.2 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,1200 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); - -static scrub_cb_t dsl_pool_scrub_clean_cb; -static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; -static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object); - -int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ -int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ -boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ -boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ -enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; - -extern int zfs_txg_timeout; - -static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { - NULL, - dsl_pool_scrub_clean_cb -}; - -/* ARGSUSED */ -static void -dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = arg1; - enum scrub_func *funcp = arg2; - dmu_object_type_t ot = 0; - boolean_t complete = B_FALSE; - - dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx); - - ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE); - ASSERT(*funcp > SCRUB_FUNC_NONE); - ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS); - - dp->dp_scrub_min_txg = 0; - dp->dp_scrub_max_txg = tx->tx_txg; - dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; - - if (*funcp == SCRUB_FUNC_CLEAN) { - vdev_t *rvd = dp->dp_spa->spa_root_vdev; - - /* rewrite all disk labels */ - vdev_config_dirty(rvd); - - if (vdev_resilver_needed(rvd, - &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_RESILVER_START); - dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, - tx->tx_txg); - } else { - spa_event_notify(dp->dp_spa, NULL, - ESC_ZFS_SCRUB_START); - } - - /* zero out the scrub stats in all vdev_stat_t's */ - vdev_scrub_stat_update(rvd, - dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - - /* - * If this is an incremental scrub, limit the DDT scrub phase - * to just the auto-ditto class (for correctness); the rest - * of the scrub should go faster using top-down pruning. - */ - if (dp->dp_scrub_min_txg > TXG_INITIAL) - dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; - - dp->dp_spa->spa_scrub_started = B_TRUE; - } - - /* back to the generic stuff */ - - if (dp->dp_blkstats == NULL) { - dp->dp_blkstats = - kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); - } - bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); - - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) - ot = DMU_OT_ZAP_OTHER; - - dp->dp_scrub_func = *funcp; - dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, - ot ? 
ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); - dp->dp_scrub_restart = B_FALSE; - dp->dp_spa->spa_scrub_errors = 0; - - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, - &dp->dp_scrub_func, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1, - &dp->dp_scrub_queue_obj, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_min_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, - &dp->dp_scrub_max_txg, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max, tx)); - VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); - - spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr, - "func=%u mintxg=%llu maxtxg=%llu", - *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg); -} - -int -dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func) -{ - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_setup_sync, dp, &func, 0)); -} - -/* ARGSUSED */ -static void -dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_pool_t *dp = arg1; - boolean_t *completep = arg2; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - mutex_enter(&dp->dp_scrub_cancel_lock); - - if (dp->dp_scrub_restart) { - dp->dp_scrub_restart = B_FALSE; - *completep = B_FALSE; - } - - /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_started = B_FALSE; - dp->dp_spa->spa_scrub_active = B_FALSE; - - dp->dp_scrub_func = SCRUB_FUNC_NONE; - VERIFY(0 == dmu_object_free(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, tx)); - dp->dp_scrub_queue_obj = 0; - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); - - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_QUEUE, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MIN_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_MAX_TXG, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_FUNC, tx)); - VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, tx)); - - (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - 
DMU_POOL_SCRUB_DDT_BOOKMARK, tx); - (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); - - spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, - "complete=%u", *completep); - - /* below is scrub-clean specific */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE, - *completep); - /* - * If the scrub/resilver completed, update all DTLs to reflect this. - * Whether it succeeded or not, vacate all temporary scrub DTLs. - */ - vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, - *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - if (*completep) - spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? - ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); - spa_errlog_rotate(dp->dp_spa); - - /* - * We may have finished replacing a device. - * Let the async thread assess this and handle the detach. - */ - spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE); - - dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -int -dsl_pool_scrub_cancel(dsl_pool_t *dp) -{ - boolean_t complete = B_FALSE; - - return (dsl_sync_task_do(dp, NULL, - dsl_pool_scrub_cancel_sync, dp, &complete, 3)); -} - -void -dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) -{ - /* - * This function will be used by bp-rewrite wad to intercept frees. - */ - zio_free(dp->dp_spa, txg, bpp); -} - -static boolean_t -bookmark_is_zero(const zbookmark_t *zb) -{ - return (zb->zb_objset == 0 && zb->zb_object == 0 && - zb->zb_level == 0 && zb->zb_blkid == 0); -} - -/* dnp is the dnode for zb1->zb_object */ -static boolean_t -bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, - const zbookmark_t *zb2) -{ - uint64_t zb1nextL0, zb2thisobj; - - ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); - ASSERT(zb2->zb_level == 0); - - /* - * A bookmark in the deadlist is considered to be after - * everything else. - */ - if (zb2->zb_object == DMU_DEADLIST_OBJECT) - return (B_TRUE); - - /* The objset_phys_t isn't before anything. */ - if (dnp == NULL) - return (B_FALSE); - - zb1nextL0 = (zb1->zb_blkid + 1) << - ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); - - zb2thisobj = zb2->zb_object ? zb2->zb_object : - zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - - if (zb1->zb_object == DMU_META_DNODE_OBJECT) { - uint64_t nextobj = zb1nextL0 * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; - return (nextobj <= zb2thisobj); - } - - if (zb1->zb_object < zb2thisobj) - return (B_TRUE); - if (zb1->zb_object > zb2thisobj) - return (B_FALSE); - if (zb2->zb_object == DMU_META_DNODE_OBJECT) - return (B_FALSE); - return (zb1nextL0 <= zb2->zb_blkid); -} - -static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) -{ - uint64_t elapsed_nanosecs; - int mintime; - - if (dp->dp_scrub_pausing) - return (B_TRUE); /* we're already pausing */ - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) - return (B_FALSE); /* we're resuming */ - - /* We only know how to resume from level-0 blocks. */ - if (zb != NULL && zb->zb_level != 0) - return (B_FALSE); - - mintime = dp->dp_scrub_isresilver ? 
zfs_resilver_min_time_ms : - zfs_scrub_min_time_ms; - elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time; - if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || - (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) { - if (zb) { - dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - dp->dp_scrub_bookmark = *zb; - } - if (ddb) { - dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)ddb->ddb_class, - (longlong_t)ddb->ddb_type, - (longlong_t)ddb->ddb_checksum, - (longlong_t)ddb->ddb_cursor); - ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); - } - dp->dp_scrub_pausing = B_TRUE; - return (B_TRUE); - } - return (B_FALSE); -} - -typedef struct zil_traverse_arg { - dsl_pool_t *zta_dp; - zil_header_t *zta_zh; -} zil_traverse_arg_t; - -/* ARGSUSED */ -static int -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return (0); - - /* - * One block ("stubby") can be allocated a long time ago; we - * want to visit that one because it has been allocated - * (on-disk) even if it hasn't been claimed (even though for - * plain scrub there's nothing to do to it). - */ - if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); - return (0); -} - -/* ARGSUSED */ -static int -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - if (lrc->lrc_txtype == TX_WRITE) { - zil_traverse_arg_t *zta = arg; - dsl_pool_t *dp = zta->zta_dp; - zil_header_t *zh = zta->zta_zh; - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - zbookmark_t zb; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return (0); - - /* - * birth can be < claim_txg if this record's txg is - * already txg sync'ed (but this log block contains - * other records that are not synced) - */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) - return (0); - - SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], - lr->lr_foid, ZB_ZIL_LEVEL, - lr->lr_offset / BP_GET_LSIZE(bp)); - - VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); - } - return (0); -} - -static void -traverse_zil(dsl_pool_t *dp, zil_header_t *zh) -{ - uint64_t claim_txg = zh->zh_claim_txg; - zil_traverse_arg_t zta = { dp, zh }; - zilog_t *zilog; - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). 
- */ - if (claim_txg == 0 && spa_writeable(dp->dp_spa)) - return; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, &zta, - claim_txg); - - zil_free(zilog); -} - -static void -scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, - uint64_t object, uint64_t blkid) -{ - zbookmark_t czb; - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; - - if (zfs_no_scrub_prefetch) - return; - - if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) - return; - - SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); - - (void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp, - buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &flags, &czb); -} - -static void -scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, - arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) -{ - int err; - arc_buf_t *buf = NULL; - - if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; - - if (scrub_pause(dp, zb, NULL)) - return; - - if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { - /* - * If we already visited this bp & everything below (in - * a prior txg), don't bother doing it again. - */ - if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark)) - return; - - /* - * If we found the block we're trying to resume from, or - * we went past it to a different object, zero it out to - * indicate that it's OK to start checking for pausing - * again. - */ - if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 || - zb->zb_object > dp->dp_scrub_bookmark.zb_object) { - dprintf("resuming at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, - (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, - (longlong_t)zb->zb_blkid); - bzero(&dp->dp_scrub_bookmark, sizeof (*zb)); - } - } - - /* - * If dsl_pool_scrub_ddt() has aready scrubbed this block, - * don't scrub it again. 
- */ - if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) - (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); - - if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; - int i; - blkptr_t *cbp; - int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - scrub_prefetch(dp, buf, cbp, zb->zb_objset, - zb->zb_object, zb->zb_blkid * epb + i); - } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - scrub_visitbp(dp, dnp, buf, cbp, &czb); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; - dnode_phys_t *cdnp; - int i, j; - int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; - - err = arc_read(NULL, dp->dp_spa, bp, pbuf, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { - for (j = 0; j < cdnp->dn_nblkptr; j++) { - blkptr_t *cbp = &cdnp->dn_blkptr[j]; - scrub_prefetch(dp, buf, cbp, zb->zb_objset, - zb->zb_blkid * epb + i, j); - } - } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { - scrub_visitdnode(dp, cdnp, buf, zb->zb_objset, - zb->zb_blkid * epb + i); - } - } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; - objset_phys_t *osp; - - err = arc_read_nolock(NULL, dp->dp_spa, bp, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); - if (err) { - mutex_enter(&dp->dp_spa->spa_scrub_lock); - dp->dp_spa->spa_scrub_errors++; - mutex_exit(&dp->dp_spa->spa_scrub_lock); - return; - } - - osp = buf->b_data; - - traverse_zil(dp, &osp->os_zil_header); - - scrub_visitdnode(dp, &osp->os_meta_dnode, - buf, zb->zb_objset, DMU_META_DNODE_OBJECT); - if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { - scrub_visitdnode(dp, &osp->os_userused_dnode, - buf, zb->zb_objset, DMU_USERUSED_OBJECT); - scrub_visitdnode(dp, &osp->os_groupused_dnode, - buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); - } - } - - if (buf) - (void) arc_buf_remove_ref(buf, &buf); -} - -static void -scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, - uint64_t objset, uint64_t object) -{ - int j; - - for (j = 0; j < dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); - } -} - -static void -scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) -{ - zbookmark_t zb; - - SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - scrub_visitbp(dp, NULL, NULL, bp, &zb); -} - -void -dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, - ZB_DESTROYED_OBJSET, 0, 0, 0); - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) != 0) { - return; - } - - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); -} - -void -dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - ASSERT(ds->ds_phys->ds_prev_snap_obj != 0); - - if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - dp->dp_scrub_bookmark.zb_objset = - ds->ds_phys->ds_prev_snap_obj; - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_prev_snap_obj, tx) == 0); - } -} - -void -dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) -{ - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds2->ds_object; - } else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) { - dp->dp_scrub_bookmark.zb_objset = ds1->ds_object; - } - - if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds1->ds_object, tx) == 0) { - int err = zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds2->ds_object, tx); - VERIFY(err == 0 || err == EEXIST); - if (err == EEXIST) { - /* Both were there to begin with */ - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } - } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds2->ds_object, tx) == 0) { - VERIFY(0 == zap_add_int(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, ds1->ds_object, tx)); - } -} - -struct enqueue_clones_arg { - dmu_tx_t *tx; - uint64_t originobj; -}; - -/* ARGSUSED */ -static int -enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - struct enqueue_clones_arg *eca = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - dp = ds->ds_dir->dd_pool; - - if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) { - while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, - ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - - dsl_dataset_rele(ds, FTAG); - if (err) - return (err); - ds = prev; - } - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, eca->tx) == 0); - } - dsl_dataset_rele(ds, FTAG); - return (0); -} - -static void -scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - uint64_t min_txg_save; - - VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - - /* - * Iterate over the bps in this ds. 
- */ - min_txg_save = dp->dp_scrub_min_txg; - dp->dp_scrub_min_txg = - MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg); - scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp); - dp->dp_scrub_min_txg = min_txg_save; - - if (dp->dp_scrub_pausing) - goto out; - - /* - * Add descendent datasets to work queue. - */ - if (ds->ds_phys->ds_next_snap_obj != 0) { - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_phys->ds_next_snap_obj, tx) == 0); - } - if (ds->ds_phys->ds_num_children > 1) { - boolean_t usenext = B_FALSE; - if (ds->ds_phys->ds_next_clones_obj != 0) { - uint64_t count; - /* - * A bug in a previous version of the code could - * cause upgrade_clones_cb() to not set - * ds_next_snap_obj when it should, leading to a - * missing entry. Therefore we can only use the - * next_clones_obj when its count is correct. - */ - int err = zap_count(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, &count); - if (err == 0 && - count == ds->ds_phys->ds_num_children - 1) - usenext = B_TRUE; - } - - if (usenext) { - VERIFY(zap_join(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - dp->dp_scrub_queue_obj, tx) == 0); - } else { - struct enqueue_clones_arg eca; - eca.tx = tx; - eca.originobj = ds->ds_object; - - (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, - NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); - } - } - -out: - dsl_dataset_rele(ds, FTAG); -} - -/* ARGSUSED */ -static int -enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) -{ - dmu_tx_t *tx = arg; - dsl_dataset_t *ds; - int err; - dsl_pool_t *dp; - - err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds); - if (err) - return (err); - - dp = ds->ds_dir->dd_pool; - - while (ds->ds_phys->ds_prev_snap_obj != 0) { - dsl_dataset_t *prev; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - FTAG, &prev); - if (err) { - dsl_dataset_rele(ds, FTAG); - return (err); - } - - /* - * If this is a clone, we don't need to worry about it for now. - */ - if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) { - dsl_dataset_rele(ds, FTAG); - dsl_dataset_rele(prev, FTAG); - return (0); - } - dsl_dataset_rele(ds, FTAG); - ds = prev; - } - - VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, - ds->ds_object, tx) == 0); - dsl_dataset_rele(ds, FTAG); - return (0); -} - -/* - * Scrub/dedup interaction. - * - * If there are N references to a deduped block, we don't want to scrub it - * N times -- ideally, we should scrub it exactly once. - * - * We leverage the fact that the dde's replication class (enum ddt_class) - * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest - * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. - * - * To prevent excess scrubbing, the scrub begins by walking the DDT - * to find all blocks with refcnt > 1, and scrubs each of these once. - * Since there are two replication classes which contain blocks with - * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. - * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. - * - * There would be nothing more to say if a block's refcnt couldn't change - * during a scrub, but of course it can so we must account for changes - * in a block's replication class. - * - * Here's an example of what can occur: - * - * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 - * when visited during the top-down scrub phase, it will be scrubbed twice. - * This negates our scrub optimization, but is otherwise harmless. 
- * - * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 - * on each visit during the top-down scrub phase, it will never be scrubbed. - * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's - * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to - * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 - * while a scrub is in progress, it scrubs the block right then. - */ -static void -dsl_pool_scrub_ddt(dsl_pool_t *dp) -{ - ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; - ddt_entry_t dde; - int error; - - while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { - if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) - return; - dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); - if (scrub_pause(dp, NULL, ddb)) - return; - } - ASSERT(error == ENOENT); - ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); -} - -void -dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, - const ddt_entry_t *dde) -{ - const ddt_key_t *ddk = &dde->dde_key; - const ddt_phys_t *ddp = dde->dde_phys; - blkptr_t blk; - zbookmark_t zb = { 0 }; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(checksum, ddk, ddp, &blk); - scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); - } -} - -void -dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) -{ - spa_t *spa = dp->dp_spa; - zap_cursor_t zc; - zap_attribute_t za; - boolean_t complete = B_TRUE; - - if (dp->dp_scrub_func == SCRUB_FUNC_NONE) - return; - - /* - * If the pool is not loaded, or is trying to unload, leave it alone. - */ - if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) - return; - - if (dp->dp_scrub_restart) { - enum scrub_func func = dp->dp_scrub_func; - dp->dp_scrub_restart = B_FALSE; - dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); - } - - if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { - /* - * We must have resumed after rebooting; reset the vdev - * stats to know that we're doing a scrub (although it - * will think we're just starting now). - */ - vdev_scrub_stat_update(spa->spa_root_vdev, - dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : - POOL_SCRUB_EVERYTHING, B_FALSE); - } - - dp->dp_scrub_pausing = B_FALSE; - dp->dp_scrub_start_time = gethrtime(); - dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - spa->spa_scrub_active = B_TRUE; - - if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { - dsl_pool_scrub_ddt(dp); - if (dp->dp_scrub_pausing) - goto out; - } - - if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { - /* First do the MOS & ORIGIN */ - scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); - if (dp->dp_scrub_pausing) - goto out; - - if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(spa, - NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); - } else { - scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); - } - ASSERT(!dp->dp_scrub_pausing); - } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { - /* - * If we were paused, continue from here. Note if the ds - * we were paused on was destroyed, the zb_objset will be - * ZB_DESTROYED_OBJSET, so we will skip this and find a new - * objset below. - */ - scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); - if (dp->dp_scrub_pausing) - goto out; - } - - /* - * In case we were paused right at the end of the ds, zero the - * bookmark so we don't think that we're still trying to resume. 
- */ - bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); - - /* keep pulling things out of the zap-object-as-queue */ - while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj), - zap_cursor_retrieve(&zc, &za) == 0) { - VERIFY(0 == zap_remove(dp->dp_meta_objset, - dp->dp_scrub_queue_obj, za.za_name, tx)); - scrub_visitds(dp, za.za_first_integer, tx); - if (dp->dp_scrub_pausing) - break; - zap_cursor_fini(&zc); - } - zap_cursor_fini(&zc); - if (dp->dp_scrub_pausing) - goto out; - - /* done. */ - - dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); - return; -out: - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), - sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), - &dp->dp_scrub_ddt_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, - &dp->dp_scrub_ddt_class_max, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &spa->spa_scrub_errors, tx)); -} - -void -dsl_pool_scrub_restart(dsl_pool_t *dp) -{ - mutex_enter(&dp->dp_scrub_cancel_lock); - dp->dp_scrub_restart = B_TRUE; - mutex_exit(&dp->dp_scrub_cancel_lock); -} - -/* - * scrub consumers - */ - -static void -count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) -{ - int i; - - /* - * If we resume after a reboot, zab will be NULL; don't record - * incomplete stats in that case. - */ - if (zab == NULL) - return; - - for (i = 0; i < 4; i++) { - int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; - int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; - zfs_blkstat_t *zb = &zab->zab_type[l][t]; - int equal; - - zb->zb_count++; - zb->zb_asize += BP_GET_ASIZE(bp); - zb->zb_lsize += BP_GET_LSIZE(bp); - zb->zb_psize += BP_GET_PSIZE(bp); - zb->zb_gangs += BP_COUNT_GANG(bp); - - switch (BP_GET_NDVAS(bp)) { - case 2: - if (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) - zb->zb_ditto_2_of_2_samevdev++; - break; - case 3: - equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[1])) + - (DVA_GET_VDEV(&bp->blk_dva[0]) == - DVA_GET_VDEV(&bp->blk_dva[2])) + - (DVA_GET_VDEV(&bp->blk_dva[1]) == - DVA_GET_VDEV(&bp->blk_dva[2])); - if (equal == 1) - zb->zb_ditto_2_of_3_samevdev++; - else if (equal == 3) - zb->zb_ditto_3_of_3_samevdev++; - break; - } - } -} - -static void -dsl_pool_scrub_clean_done(zio_t *zio) -{ - spa_t *spa = zio->io_spa; - - zio_data_buf_free(zio->io_data, zio->io_size); - - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - cv_broadcast(&spa->spa_scrub_io_cv); - - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) - spa->spa_scrub_errors++; - mutex_exit(&spa->spa_scrub_lock); -} - -static int -dsl_pool_scrub_clean_cb(dsl_pool_t *dp, - const blkptr_t *bp, const zbookmark_t *zb) -{ - size_t size = BP_GET_PSIZE(bp); - spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); - boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int zio_priority; - - if (phys_birth <= dp->dp_scrub_min_txg || - phys_birth >= dp->dp_scrub_max_txg) - return (0); - - count_block(dp->dp_blkstats, bp); - - if (dp->dp_scrub_isresilver == 0) { - /* It's a scrub */ - zio_flags |= ZIO_FLAG_SCRUB; - zio_priority = ZIO_PRIORITY_SCRUB; - needs_io = B_TRUE; - } else { - /* It's a resilver */ - zio_flags |= ZIO_FLAG_RESILVER; - zio_priority = ZIO_PRIORITY_RESILVER; - needs_io = B_FALSE; - } - - /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == ZB_ZIL_LEVEL) - zio_flags |= ZIO_FLAG_SPECULATIVE; - - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); - - /* - * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. - */ - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_scrub_examined += - DVA_GET_ASIZE(&bp->blk_dva[d]); - mutex_exit(&vd->vdev_stat_lock); - - /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. 
- */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } - } - - if (needs_io && !zfs_no_scrub_io) { - void *data = zio_data_buf_alloc(size); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; - mutex_exit(&spa->spa_scrub_lock); - - zio_nowait(zio_read(NULL, spa, bp, data, size, - dsl_pool_scrub_clean_done, NULL, zio_priority, - zio_flags, zb)); - } - - /* do not relocate this block */ - return (0); -} - -int -dsl_pool_scrub_clean(dsl_pool_t *dp) -{ - spa_t *spa = dp->dp_spa; - - /* - * Purge all vdev caches and probe all devices. We do this here - * rather than in sync context because this requires a writer lock - * on the spa_config lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. - */ - spa_vdev_state_enter(spa, SCL_NONE); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); - - return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); -} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 dsl_synctask.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c 27 Feb 2010 22:30:59 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c 4 Feb 2015 07:24:17 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include @@ -28,198 +28,159 @@ #include #include #include -#include +#include #define DST_AVG_BLKSHIFT 14 /* ARGSUSED */ static int -dsl_null_checkfunc(void *arg1, void *arg2, dmu_tx_t *tx) +dsl_null_checkfunc(void *arg, dmu_tx_t *tx) { return (0); } -dsl_sync_task_group_t * -dsl_sync_task_group_create(dsl_pool_t *dp) -{ - dsl_sync_task_group_t *dstg; - - dstg = kmem_zalloc(sizeof (dsl_sync_task_group_t), KM_SLEEP); - list_create(&dstg->dstg_tasks, sizeof (dsl_sync_task_t), - offsetof(dsl_sync_task_t, dst_node)); - dstg->dstg_pool = dp; - dstg->dstg_cr = CRED(); - - return (dstg); -} - -void -dsl_sync_task_create(dsl_sync_task_group_t *dstg, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_t *dst; - - if (checkfunc == NULL) - checkfunc = dsl_null_checkfunc; - dst = kmem_zalloc(sizeof (dsl_sync_task_t), KM_SLEEP); - dst->dst_checkfunc = checkfunc; - dst->dst_syncfunc = syncfunc; - dst->dst_arg1 = arg1; - dst->dst_arg2 = arg2; - list_insert_tail(&dstg->dstg_tasks, dst); - - dstg->dstg_space += blocks_modified << DST_AVG_BLKSHIFT; -} - +/* + * Called from open context to perform a callback in syncing context. Waits + * for the operation to complete. + * + * The checkfunc will be called from open context as a preliminary check + * which can quickly fail. If it succeeds, it will be called again from + * syncing context. 
The checkfunc should generally be designed to work + * properly in either context, but if necessary it can check + * dmu_tx_is_syncing(tx). + * + * The synctask infrastructure enforces proper locking strategy with respect + * to the dp_config_rwlock -- the lock will always be held when the callbacks + * are called. It will be held for read during the open-context (preliminary) + * call to the checkfunc, and then held for write from syncing context during + * the calls to the check and sync funcs. + * + * A dataset or pool name can be passed as the first argument. Typically, + * the check func will hold, check the return value of the hold, and then + * release the dataset. The sync func will VERIFYO(hold()) the dataset. + * This is safe because no changes can be made between the check and sync funcs, + * and the sync func will only be called if the check func successfully opened + * the dataset. + */ int -dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) +dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) { + spa_t *spa; dmu_tx_t *tx; - uint64_t txg; - dsl_sync_task_t *dst; + int err; + dsl_sync_task_t dst = { 0 }; + dsl_pool_t *dp; -top: - tx = dmu_tx_create_dd(dstg->dstg_pool->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + err = spa_open(pool, &spa, FTAG); + if (err != 0) + return (err); + dp = spa_get_dsl(spa); - txg = dmu_tx_get_txg(tx); +top: + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - /* Do a preliminary error check. */ - dstg->dstg_err = 0; - rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_READER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { -#ifdef ZFS_DEBUG - /* - * Only check half the time, otherwise, the sync-context - * check will almost never fail. - */ - if (spa_get_random(2) == 0) - continue; -#endif - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - rw_exit(&dstg->dstg_pool->dp_config_rwlock); + dst.dst_pool = dp; + dst.dst_txg = dmu_tx_get_txg(tx); + dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst.dst_space_check = space_check; + dst.dst_checkfunc = checkfunc != NULL ? 
checkfunc : dsl_null_checkfunc; + dst.dst_syncfunc = syncfunc; + dst.dst_arg = arg; + dst.dst_error = 0; + dst.dst_nowaiter = B_FALSE; + + dsl_pool_config_enter(dp, FTAG); + err = dst.dst_checkfunc(arg, tx); + dsl_pool_config_exit(dp, FTAG); - if (dstg->dstg_err) { + if (err != 0) { dmu_tx_commit(tx); - return (dstg->dstg_err); + spa_close(spa, FTAG); + return (err); } - VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); + VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, &dst, dst.dst_txg)); dmu_tx_commit(tx); - txg_wait_synced(dstg->dstg_pool, txg); + txg_wait_synced(dp, dst.dst_txg); - if (dstg->dstg_err == EAGAIN) { - txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); + if (dst.dst_error == EAGAIN) { + txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); goto top; } - return (dstg->dstg_err); + spa_close(spa, FTAG); + return (dst.dst_error); } void -dsl_sync_task_group_nowait(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) { - uint64_t txg; + dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); - dstg->dstg_nowaiter = B_TRUE; - txg = dmu_tx_get_txg(tx); - VERIFY(0 == txg_list_add(&dstg->dstg_pool->dp_sync_tasks, dstg, txg)); -} - -void -dsl_sync_task_group_destroy(dsl_sync_task_group_t *dstg) -{ - dsl_sync_task_t *dst; + dst->dst_pool = dp; + dst->dst_txg = dmu_tx_get_txg(tx); + dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst->dst_space_check = space_check; + dst->dst_checkfunc = dsl_null_checkfunc; + dst->dst_syncfunc = syncfunc; + dst->dst_arg = arg; + dst->dst_error = 0; + dst->dst_nowaiter = B_TRUE; - while (dst = list_head(&dstg->dstg_tasks)) { - list_remove(&dstg->dstg_tasks, dst); - kmem_free(dst, sizeof (dsl_sync_task_t)); - } - kmem_free(dstg, sizeof (dsl_sync_task_group_t)); + VERIFY(txg_list_add_tail(&dp->dp_sync_tasks, dst, dst->dst_txg)); } +/* + * Called in syncing context to execute the synctask. + */ void -dsl_sync_task_group_sync(dsl_sync_task_group_t *dstg, dmu_tx_t *tx) +dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) { - dsl_sync_task_t *dst; - void *tr_cookie; + dsl_pool_t *dp = dst->dst_pool; - ASSERT3U(dstg->dstg_err, ==, 0); + ASSERT0(dst->dst_error); /* * Check for sufficient space. + * + * When the sync task was created, the caller specified the + * type of space checking required. See the comment in + * zfs_space_check_t for details on the semantics of each + * type of space checking. + * + * We just check against what's on-disk; we don't want any + * in-flight accounting to get in our way, because open context + * may have already used up various in-core limits + * (arc_tempreserve, dsl_pool_tempreserve). */ - dstg->dstg_err = dsl_dir_tempreserve_space(dstg->dstg_pool->dp_mos_dir, - dstg->dstg_space, dstg->dstg_space * 3, 0, 0, &tr_cookie, tx); - /* don't bother trying again */ - if (dstg->dstg_err == ERESTART) - dstg->dstg_err = EAGAIN; - if (dstg->dstg_err) - return; - - /* - * Check for errors by calling checkfuncs. - */ - rw_enter(&dstg->dstg_pool->dp_config_rwlock, RW_WRITER); - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_err = - dst->dst_checkfunc(dst->dst_arg1, dst->dst_arg2, tx); - if (dst->dst_err) - dstg->dstg_err = dst->dst_err; - } - - if (dstg->dstg_err == 0) { - /* - * Execute sync tasks. 
- */ - for (dst = list_head(&dstg->dstg_tasks); dst; - dst = list_next(&dstg->dstg_tasks, dst)) { - dst->dst_syncfunc(dst->dst_arg1, dst->dst_arg2, - dstg->dstg_cr, tx); + if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) { + uint64_t quota = dsl_pool_adjustedsize(dp, + dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) - + metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + /* MOS space is triple-dittoed, so we multiply by 3. */ + if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) { + dst->dst_error = SET_ERROR(ENOSPC); + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); + return; } } - rw_exit(&dstg->dstg_pool->dp_config_rwlock); - - dsl_dir_tempreserve_clear(tr_cookie, tx); - if (dstg->dstg_nowaiter) - dsl_sync_task_group_destroy(dstg); -} - -int -dsl_sync_task_do(dsl_pool_t *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified) -{ - dsl_sync_task_group_t *dstg; - int err; - - dstg = dsl_sync_task_group_create(dp); - dsl_sync_task_create(dstg, checkfunc, syncfunc, - arg1, arg2, blocks_modified); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); - return (err); -} - -void -dsl_sync_task_do_nowait(dsl_pool_t *dp, - dsl_checkfunc_t *checkfunc, dsl_syncfunc_t *syncfunc, - void *arg1, void *arg2, int blocks_modified, dmu_tx_t *tx) -{ - dsl_sync_task_group_t *dstg; - - dstg = dsl_sync_task_group_create(dp); - dsl_sync_task_create(dstg, checkfunc, syncfunc, - arg1, arg2, blocks_modified); - dsl_sync_task_group_nowait(dstg, tx); + /* + * Check for errors by calling checkfunc. + */ + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); + if (dst->dst_error == 0) + dst->dst_syncfunc(dst->dst_arg, tx); + rrw_exit(&dp->dp_config_rwlock, FTAG); + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_userhold.c 10 Oct 2016 11:09:56 -0000 @@ -0,0 +1,666 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct dsl_dataset_user_hold_arg { + nvlist_t *dduha_holds; + nvlist_t *dduha_chkholds; + nvlist_t *dduha_errlist; + minor_t dduha_minor; +} dsl_dataset_user_hold_arg_t; + +/* + * If you add new checks here, you may need to add additional checks to the + * "temporary" case in snapshot_check() in dmu_objset.c. + */ +int +dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, + boolean_t temphold, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + int error = 0; + + ASSERT(dsl_pool_config_held(dp)); + + if (strlen(htag) > MAXNAMELEN) + return (SET_ERROR(E2BIG)); + /* Tempholds have a more restricted length */ + if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (SET_ERROR(E2BIG)); + + /* tags must be unique (if ds already exists) */ + if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + uint64_t value; + + error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + htag, 8, 1, &value); + if (error == 0) + error = SET_ERROR(EEXIST); + else if (error == ENOENT) + error = 0; + } + + return (error); +} + +static int +dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) + return (SET_ERROR(ENOTSUP)); + + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + dsl_dataset_t *ds; + int error = 0; + char *htag, *name; + + /* must be a snapshot */ + name = nvpair_name(pair); + if (strchr(name, '@') == NULL) + error = SET_ERROR(EINVAL); + + if (error == 0) + error = nvpair_value_string(pair, &htag); + + if (error == 0) + error = dsl_dataset_hold(dp, name, FTAG, &ds); + + if (error == 0) { + error = dsl_dataset_user_hold_check_one(ds, htag, + dduha->dduha_minor != 0, tx); + dsl_dataset_rele(ds, FTAG); + } + + if (error == 0) { + fnvlist_add_string(dduha->dduha_chkholds, name, htag); + } else { + /* + * We register ENOENT errors so they can be correctly + * reported if needed, such as when all holds fail. + */ + fnvlist_add_int32(dduha->dduha_errlist, name, error); + if (error != ENOENT) + return (error); + } + } + + return (0); +} + + +static void +dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, + const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; + } + ds->ds_userrefs++; + + VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (minor != 0) { + char name[MAXNAMELEN]; + nvlist_t *tags; + + VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, + htag, now, tx)); + (void) snprintf(name, sizeof (name), "%llx", + (u_longlong_t)ds->ds_object); + + if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(tmpholds, name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } + } + + spa_history_log_internal_ds(ds, "hold", tx, + "tag=%s temp=%d refs=%llu", + htag, minor != 0, ds->ds_userrefs); +} + +typedef struct zfs_hold_cleanup_arg { + char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t zhca_spa_load_guid; + nvlist_t *zhca_holds; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + spa_t *spa; + int error; + + error = spa_open(ca->zhca_spaname, &spa, FTAG); + if (error != 0) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded", + ca->zhca_spaname); + return; + } + if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded (guid doesn't match)", + ca->zhca_spaname); + spa_close(spa, FTAG); + return; + } + + (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); + fnvlist_free(ca->zhca_holds); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + spa_close(spa, FTAG); +} + +static void +dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) +{ + zfs_hold_cleanup_arg_t *ca; + + if (minor == 0 || nvlist_empty(holds)) { + fnvlist_free(holds); + return; + } + + ASSERT(spa != NULL); + ca = kmem_alloc(sizeof (*ca), KM_SLEEP); + + (void) strlcpy(ca->zhca_spaname, spa_name(spa), + sizeof (ca->zhca_spaname)); + ca->zhca_spa_load_guid = spa_load_guid(spa); + ca->zhca_holds = holds; + VERIFY0(zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} + +void +dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, + minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + nvlist_t *tmpholds; + + if (minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); + dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); +} + +static void +dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvlist_t *tmpholds; + uint64_t now = gethrestime_sec(); + + if (dduha->dduha_minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); + pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, + fnvpair_value_string(pair), dduha->dduha_minor, now, tx); + dsl_dataset_rele(ds, FTAG); + } + dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); +} + +/* + * The full semantics of this function are described in the comment above + * lzc_hold(). 
+ * + * To summarize: + * holds is nvl of snapname -> holdname + * errlist will be filled in with snapname -> error + * + * The snapshots must all be in the same pool. + * + * Holds for snapshots that don't exist will be skipped. + * + * If none of the snapshots for requested holds exist then ENOENT will be + * returned. + * + * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned + * up when the process exits. + * + * On success all the holds, for snapshots that existed, will be created and 0 + * will be returned. + * + * On failure no holds will be created, the errlist will be filled in, + * and an errno will be returned. + * + * In all cases the errlist will contain entries for holds where the snapshot + * didn't exist. + */ +int +dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) +{ + dsl_dataset_user_hold_arg_t dduha; + nvpair_t *pair; + int ret; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + dduha.dduha_holds = holds; + dduha.dduha_chkholds = fnvlist_alloc(); + dduha.dduha_errlist = errlist; + dduha.dduha_minor = cleanup_minor; + + ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, &dduha, + fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED); + fnvlist_free(dduha.dduha_chkholds); + + return (ret); +} + +typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, + dsl_dataset_t **dsp); + +typedef struct dsl_dataset_user_release_arg { + dsl_holdfunc_t *ddura_holdfunc; + nvlist_t *ddura_holds; + nvlist_t *ddura_todelete; + nvlist_t *ddura_errlist; + nvlist_t *ddura_chkholds; +} dsl_dataset_user_release_arg_t; + +/* Place a dataset hold on the snapshot identified by passed dsobj string */ +static int +dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, + dsl_dataset_t **dsp) +{ + return (dsl_dataset_hold_obj(dp, strtonum(dsobj, NULL), tag, dsp)); +} + +static int +dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, + dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) +{ + uint64_t zapobj; + nvlist_t *holds_found; + objset_t *mos; + int numholds; + + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (nvlist_empty(holds)) + return (0); + + numholds = 0; + mos = ds->ds_dir->dd_pool->dp_meta_objset; + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; + holds_found = fnvlist_alloc(); + + for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + uint64_t tmp; + int error; + const char *holdname = nvpair_name(pair); + + if (zapobj != 0) + error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); + else + error = SET_ERROR(ENOENT); + + /* + * Non-existent holds are put on the errlist, but don't + * cause an overall failure.
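/*
 * [Editor's illustration -- not part of the patch.]  dsl_dataset_user_hold()
 * above is the syncing-context backend reached from userland through
 * libzfs_core's lzc_hold(), and it consumes the same "snapshot name -> hold
 * tag" nvlist summarized in the comment above.  A minimal userland sketch,
 * assuming the libzfs_core/libnvpair interfaces added elsewhere in this
 * import; the snapshot and tag names are hypothetical.
 */
#include <libzfs_core.h>
#include <libnvpair.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *holds = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int err;

	if (libzfs_core_init() != 0)
		return (1);
	/* One entry per snapshot: snapshot name -> hold tag. */
	fnvlist_add_string(holds, "tank/fs@monday", "backup-job");
	/*
	 * A cleanup_fd of -1 asks for a permanent hold; passing an open
	 * /dev/zfs descriptor instead makes the hold temporary, released by
	 * the onexit callback above when that descriptor is closed.
	 */
	err = lzc_hold(holds, -1, &errlist);
	if (err != 0)
		(void) fprintf(stderr, "lzc_hold: error %d\n", err);
	fnvlist_free(holds);
	if (errlist != NULL)
		fnvlist_free(errlist);
	libzfs_core_fini();
	return (err != 0);
}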
+ */ + if (error == ENOENT) { + if (ddura->ddura_errlist != NULL) { + char *errtag = kmem_asprintf("%s#%s", + snapname, holdname); + fnvlist_add_int32(ddura->ddura_errlist, errtag, + ENOENT); + strfree(errtag); + } + continue; + } + + if (error != 0) { + fnvlist_free(holds_found); + return (error); + } + + fnvlist_add_boolean(holds_found, holdname); + numholds++; + } + + if (DS_IS_DEFER_DESTROY(ds) && + dsl_dataset_phys(ds)->ds_num_children == 1 && + ds->ds_userrefs == numholds) { + /* we need to destroy the snapshot as well */ + if (dsl_dataset_long_held(ds)) { + fnvlist_free(holds_found); + return (SET_ERROR(EBUSY)); + } + fnvlist_add_boolean(ddura->ddura_todelete, snapname); + } + + if (numholds != 0) { + fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, + holds_found); + } + fnvlist_free(holds_found); + + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura; + dsl_holdfunc_t *holdfunc; + dsl_pool_t *dp; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + dp = dmu_tx_pool(tx); + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + ddura = arg; + holdfunc = ddura->ddura_holdfunc; + + for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + int error; + dsl_dataset_t *ds; + nvlist_t *holds; + const char *snapname = nvpair_name(pair); + + error = nvpair_value_nvlist(pair, &holds); + if (error != 0) + error = (SET_ERROR(EINVAL)); + else + error = holdfunc(dp, snapname, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_user_release_check_one(ddura, ds, + holds, snapname); + dsl_dataset_rele(ds, FTAG); + } + if (error != 0) { + if (ddura->ddura_errlist != NULL) { + fnvlist_add_int32(ddura->ddura_errlist, + snapname, error); + } + /* + * Non-existent snapshots are put on the errlist, + * but don't cause an overall failure. + */ + if (error != ENOENT) + return (error); + } + } + + return (0); +} + +static void +dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + int error; + const char *holdname = nvpair_name(pair); + + /* Remove temporary hold if one exists. 
*/ + error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); + VERIFY(error == 0 || error == ENOENT); + + VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + holdname, tx)); + ds->ds_userrefs--; + + spa_history_log_internal_ds(ds, "release", tx, + "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); + } +} + +static void +dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura = arg; + dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; + dsl_pool_t *dp = dmu_tx_pool(tx); + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, + pair)) { + dsl_dataset_t *ds; + const char *name = nvpair_name(pair); + + VERIFY0(holdfunc(dp, name, FTAG, &ds)); + + dsl_dataset_user_release_sync_one(ds, + fnvpair_value_nvlist(pair), tx); + if (nvlist_exists(ddura->ddura_todelete, name)) { + ASSERT(ds->ds_userrefs == 0 && + dsl_dataset_phys(ds)->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)); + dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); + } + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The full semantics of this function are described in the comment above + * lzc_release(). + * + * To summarize: + * Releases holds specified in the nvl holds. + * + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error + * + * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, + * otherwise they should be the names of snapshots. + * + * As a release may cause snapshots to be destroyed this tries to ensure they + * aren't mounted. + * + * Releases of non-existent holds are skipped. + * + * At least one hold must have been released for this function to succeed + * and return 0. + */ +static int +dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, + dsl_pool_t *tmpdp) +{ + dsl_dataset_user_release_arg_t ddura; + nvpair_t *pair; + char *pool; + int error; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + /* + * The release may cause snapshots to be destroyed; make sure they + * are not mounted. + */ + if (tmpdp != NULL) { + /* Temporary holds are specified by dsobj string. */ + ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; + pool = spa_name(tmpdp->dp_spa); +#ifdef _KERNEL + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + dsl_dataset_t *ds; + + dsl_pool_config_enter(tmpdp, FTAG); + error = dsl_dataset_hold_obj_string(tmpdp, + nvpair_name(pair), FTAG, &ds); + if (error == 0) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds, name); + dsl_pool_config_exit(tmpdp, FTAG); + dsl_dataset_rele(ds, FTAG); + (void) zfs_unmount_snap(name); + } else { + dsl_pool_config_exit(tmpdp, FTAG); + } + } +#endif + } else { + /* Non-temporary holds are specified by name.
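/*
 * [Editor's illustration -- not part of the patch.]  The nested nvlist
 * format described above ("snapname -> { holdname, ... }") is what userland
 * hands to libzfs_core's lzc_release(), which ends up in
 * dsl_dataset_user_release() below.  A hedged sketch; the snapshot and tag
 * names are hypothetical.
 */
#include <libzfs_core.h>
#include <libnvpair.h>

static int
release_one_hold(const char *snapname, const char *tag)
{
	nvlist_t *holds = fnvlist_alloc();
	nvlist_t *tags = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int err;

	fnvlist_add_boolean(tags, tag);			/* { holdname, ... } */
	fnvlist_add_nvlist(holds, snapname, tags);	/* snapname -> tags */
	err = lzc_release(holds, &errlist);
	fnvlist_free(tags);
	fnvlist_free(holds);
	if (errlist != NULL)
		fnvlist_free(errlist);
	return (err);
}

/* Example use: release_one_hold("tank/fs@monday", "backup-job"); */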
*/ + ddura.ddura_holdfunc = dsl_dataset_hold; + pool = nvpair_name(pair); +#ifdef _KERNEL + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + (void) zfs_unmount_snap(nvpair_name(pair)); + } +#endif + } + + ddura.ddura_holds = holds; + ddura.ddura_errlist = errlist; + ddura.ddura_todelete = fnvlist_alloc(); + ddura.ddura_chkholds = fnvlist_alloc(); + + error = dsl_sync_task(pool, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE); + fnvlist_free(ddura.ddura_todelete); + fnvlist_free(ddura.ddura_chkholds); + + return (error); +} + +/* + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error + */ +int +dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) +{ + return (dsl_dataset_user_release_impl(holds, errlist, NULL)); +} + +/* + * holds is nvl of snapdsobj -> { holdname, ... } + */ +void +dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) +{ + ASSERT(dp != NULL); + (void) dsl_dataset_user_release_impl(holds, NULL, dp); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + fnvlist_add_uint64(nvl, za->za_name, + za->za_first_integer); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (0); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/edonr_zfs.c 22 Nov 2015 17:22:31 -0000 @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Use is subject to license terms. + */ +#include +#include +#include + +#define EDONR_MODE 512 +#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE + +/* + * Native zio_checksum interface for the Edon-R hash function. 
+ */ +/*ARGSUSED*/ +void +zio_checksum_edonr_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + uint8_t digest[EDONR_MODE / 8]; + EdonRState ctx; + + ASSERT(ctx_template != NULL); + bcopy(ctx_template, &ctx, sizeof (ctx)); + EdonRUpdate(&ctx, buf, size * 8); + EdonRFinal(&ctx, digest); + bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); +} + +/* + * Byteswapped zio_checksum interface for the Edon-R hash function. + */ +void +zio_checksum_edonr_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + zio_checksum_edonr_native(buf, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]); + zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]); + zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]); + zcp->zc_word[3] = BSWAP_64(zcp->zc_word[3]); +} + +void * +zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) +{ + EdonRState *ctx; + uint8_t salt_block[EDONR_BLOCK_SIZE]; + + /* + * Edon-R needs all but the last hash invocation to be on full-size + * blocks, but the salt is too small. Rather than simply padding it + * with zeros, we expand the salt into a new salt block of proper + * size by double-hashing it (the new salt block will be composed of + * H(salt) || H(H(salt))). + */ + CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8)); + EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, + salt_block); + EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block + + EDONR_MODE / 8); + + /* + * Feed the new salt block into the hash function - this will serve + * as our MAC key. + */ + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + EdonRInit(ctx, EDONR_MODE); + EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8); + return (ctx); +} + +void +zio_checksum_edonr_tmpl_free(void *ctx_template) +{ + EdonRState *ctx = ctx_template; + + bzero(ctx, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/lz4.c 25 Apr 2017 16:40:13 -0000 @@ -0,0 +1,1039 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-2013, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#include + +static int real_LZ4_compress(const char *source, char *dest, int isize, + int osize); +static int LZ4_compressBound(int isize); +static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, + int isize, int maxOutputSize); +static int LZ4_compressCtx(void *ctx, const char *source, char *dest, + int isize, int osize); +static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest, + int isize, int osize); + +#ifdef __FreeBSD__ +static kmem_cache_t *lz4_ctx_cache; +#endif + +/*ARGSUSED*/ +size_t +lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uint32_t bufsiz; + char *dest = d_start; + + ASSERT(d_len >= sizeof (bufsiz)); + + bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len, + d_len - sizeof (bufsiz)); + + /* Signal an error if the compression routine returned zero. */ + if (bufsiz == 0) + return (s_len); + + /* + * Encode the compresed buffer size at the start. We'll need this in + * decompression to counter the effects of padding which might be + * added to the compressed buffer and which, if unhandled, would + * confuse the hell out of our decompression function. + */ + *(uint32_t *)dest = BE_32(bufsiz); + + return (bufsiz + sizeof (bufsiz)); +} + +/*ARGSUSED*/ +int +lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + const char *src = s_start; + uint32_t bufsiz = BE_IN32(src); + + /* invalid compressed buffer size encoded at start */ + if (bufsiz + sizeof (bufsiz) > s_len) + return (1); + + /* + * Returns 0 on success (decompression function returned non-negative) + * and non-zero on failure (decompression function returned negative. + */ + return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)], + d_start, bufsiz, d_len) < 0); +} + +/* + * LZ4 API Description: + * + * Simple Functions: + * real_LZ4_compress() : + * isize : is the input size. Max supported value is ~1.9GB + * return : the number of bytes written in buffer dest + * or 0 if the compression fails (if LZ4_COMPRESSMIN is set). + * note : destination buffer must be already allocated. + * destination buffer must be sized to handle worst cases + * situations (input data not compressible) worst case size + * evaluation is provided by function LZ4_compressBound(). + * + * Advanced Functions + * + * LZ4_compressBound() : + * Provides the maximum size that LZ4 may output in a "worst case" + * scenario (input data not compressible) primarily useful for memory + * allocation of output buffer. + * + * isize : is the input size. 
Max supported value is ~1.9GB + * return : maximum output size in a "worst case" scenario + * note : this function is limited by "int" range (2^31-1) + * + * LZ4_uncompress_unknownOutputSize() : + * isize : is the input size, therefore the compressed size + * maxOutputSize : is the size of the destination buffer (which must be + * already allocated) + * return : the number of bytes decoded in the destination buffer + * (necessarily <= maxOutputSize). If the source stream is + * malformed, the function will stop decoding and return a + * negative result, indicating the byte position of the faulty + * instruction. This function never writes beyond dest + + * maxOutputSize, and is therefore protected against malicious + * data packets. + * note : Destination buffer must be already allocated. + * + * LZ4_compressCtx() : + * This function explicitly handles the CTX memory structure. + * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_zalloc). Passing NULL + * isn't valid. + * + * LZ4_compress64kCtx() : + * Same as LZ4_compressCtx(), but specific to small inputs (<64KB). + * isize *Must* be <64KB, otherwise the output will be corrupted. + * + * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated + * by the caller (either on the stack or using kmem_zalloc). Passing NULL + * isn't valid. + */ + +/* + * Tuning parameters + */ + +/* + * COMPRESSIONLEVEL: Increasing this value improves compression ratio + * Lowering this value reduces memory usage. Reduced memory usage + * typically improves speed, due to cache effect (ex: L1 32KB for Intel, + * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes + * (examples : 12 -> 16KB ; 17 -> 512KB) + */ +#define COMPRESSIONLEVEL 12 + +/* + * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the + * algorithm skip faster data segments considered "incompressible". + * This may decrease compression ratio dramatically, but will be + * faster on incompressible data. Increasing this value will make + * the algorithm search more before declaring a segment "incompressible". + * This could improve compression a bit, but will be slower on + * incompressible data. The default value (6) is recommended. + */ +#define NOTCOMPRESSIBLE_CONFIRMATION 6 + +/* + * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to + * performance for big endian cpu, but the resulting compressed stream + * will be incompatible with little-endian CPU. You can set this option + * to 1 in situations where data will stay within closed environment. + * This option is useless on Little_Endian CPU (such as x86). + */ +/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ + +/* + * CPU Feature Detection + */ + +/* 32 or 64 bits ? */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || \ + defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || \ + defined(__LP64__) || defined(_LP64)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Limits the amount of stack space that the algorithm may consume to hold + * the compression lookup table. The value `9' here means we'll never use + * more than 2k of stack (see above for a description of COMPRESSIONLEVEL). + * If more memory is needed, it is allocated from the heap. + */ +/* FreeBSD: Use heap for all platforms for now */ +#define STACKLIMIT 0 + +/* + * Little Endian or Big Endian? + * Note: overwrite the below #define if you know your architecture endianess. 
+ */ +#if BYTE_ORDER == BIG_ENDIAN +#define LZ4_BIG_ENDIAN 1 +#else +/* + * Little Endian assumed. PDP Endian and other very rare endian format + * are unsupported. + */ +#endif + +/* + * Unaligned memory access is automatically enabled for "common" CPU, + * such as x86. For others CPU, the compiler will be more cautious, and + * insert extra code to ensure aligned access is respected. If you know + * your target CPU supports unaligned memory access, you may want to + * force this option manually to improve performance + */ +#if defined(__ARM_FEATURE_UNALIGNED) +#define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +/* + * FreeBSD: can't use GCC's __builtin_ctz when using sparc64 because + * gcc currently rely on libcompiler_rt. + * + * TODO: revisit this when situation changes. + */ +#if defined(__sparc64__) +#define LZ4_FORCE_SW_BITCOUNT +#endif + +/* + * Compiler Options + */ +#if __STDC_VERSION__ >= 199901L /* C99 */ +/* "restrict" is a known keyword */ +#else +/* Disable restrict */ +#define restrict +#endif + +#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \ + (((x) & 0xffu) << 8))) + +#define expect(expr, value) (__builtin_expect((expr), (value))) + +#if defined(likely) +#undef likely +#endif +#if defined(unlikely) +#undef unlikely +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + +/* Basic types */ +#define BYTE uint8_t +#define U16 uint16_t +#define U32 uint32_t +#define S32 int32_t +#define U64 uint64_t + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack(1) +#endif + +typedef struct _U16_S { + U16 v; +} U16_S; +typedef struct _U32_S { + U32 v; +} U32_S; +typedef struct _U64_S { + U64 v; +} U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +#pragma pack() +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + +/* + * Constants + */ +#define MINMATCH 4 + +#define HASH_LOG COMPRESSIONLEVEL +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + +#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \ + NOTCOMPRESSIBLE_CONFIRMATION : 2) + +/* + * Defines if memory is allocated into the stack (local variable), + * or into the heap (kmem_alloc()). 
+ */ +#define HEAPMODE (HASH_LOG > STACKLIMIT) +#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<> ((MINMATCH * 8) - \ + HASH_LOG)) +#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) +#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e); +#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \ + d = e; } + + +/* Private functions */ +#if LZ4_ARCH64 + +static inline int +LZ4_NbCommonBytes(register U64 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); +#else + int r; + if (!(val >> 32)) { + r = 4; + } else { + r = 0; + val >>= 32; + } + if (!(val >> 16)) { + r += 2; + val >>= 8; + } else { + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); +#else + static const int DeBruijnBytePos[64] = + { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, + 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, + 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, + 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 + }; + return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >> + 58]; +#endif +#endif +} + +#else + +static inline int +LZ4_NbCommonBytes(register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); +#else + int r; + if (!(val >> 16)) { + r = 2; + val >>= 8; + } else { + r = 0; + val >>= 24; + } + r += (!val); + return (r); +#endif +#else +#if !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); +#else + static const int DeBruijnBytePos[32] = { + 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 + }; + return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> + 27]; +#endif +#endif +} + +#endif + +/* Public functions */ + +static int +LZ4_compressBound(int isize) +{ + return (isize + (isize / 255) + 16); +} + +/* Compression functions */ + +/*ARGSUSED*/ +static int +LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *)ctx; + HTYPE *HashTable = (HTYPE *) (srt->hashTable); +#else + HTYPE HashTable[HASHTABLESIZE] = { 0 }; +#endif + + const BYTE *ip = (BYTE *) source; + INITBASE(base); + const BYTE *anchor = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardH = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if unlikely(forwardIp > mflimit) { + goto _last_literals; + } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > 
anchor) && (ref > (BYTE *) source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while likely(ip < matchlimit - (STEPSIZE - 1)) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) { + *op++ = 255; + } + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + + + +/* Note : this function is valid only if isize < LZ4_64KLIMIT */ +#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1)) +#define HASHLOG64K (HASH_LOG + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \ + HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) + +/*ARGSUSED*/ +static int +LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *)ctx; + U16 *HashTable = (U16 *) (srt->hashTable); +#else + U16 HashTable[HASH64KTABLESIZE] = { 0 }; +#endif + + const BYTE *ip = (BYTE *) source; + const BYTE *anchor = ip; + const BYTE *const base = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int 
skipStrength = SKIPSTRENGTH; + U32 forwardH; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (ip < matchlimit - (STEPSIZE - 1)) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) + *op++ = 255; + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + +static int +real_LZ4_compress(const char *source, char *dest, int isize, int osize) +{ +#if HEAPMODE +#ifdef __FreeBSD__ + void *ctx = kmem_cache_alloc(lz4_ctx_cache, KM_NOSLEEP); +#else + void *ctx = kmem_zalloc(sizeof (struct refTables), KM_NOSLEEP); +#endif + int result; + + /* + * out of kernel memory, gently fall through - 
this will disable + * compression in zio_compress_data + */ + if (ctx == NULL) + return (0); + + bzero(ctx, sizeof(struct refTables)); + if (isize < LZ4_64KLIMIT) + result = LZ4_compress64kCtx(ctx, source, dest, isize, osize); + else + result = LZ4_compressCtx(ctx, source, dest, isize, osize); + +#ifdef __FreeBSD__ + kmem_cache_free(lz4_ctx_cache, ctx); +#else + kmem_free(ctx, sizeof (struct refTables)); +#endif + return (result); +#else + if (isize < (int)LZ4_64KLIMIT) + return (LZ4_compress64kCtx(NULL, source, dest, isize, osize)); + return (LZ4_compressCtx(NULL, source, dest, isize, osize)); +#endif +} + +/* Decompression functions */ + +/* + * Note: The decoding function LZ4_uncompress_unknownOutputSize() is safe + * against "buffer overflow" attack type. They will never write nor + * read outside of the provided output buffers. + * LZ4_uncompress_unknownOutputSize() also insures that it will never + * read outside of the input buffer. A corrupted input will produce + * an error result, a negative int, indicating the position of the + * error within input stream. + */ + +static int +LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, + int maxOutputSize) +{ + /* Local Variables */ + const BYTE *restrict ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + BYTE *op = (BYTE *) dest; + BYTE *const oend = op + maxOutputSize; + BYTE *cpy; + + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; +#endif + + /* Main Loop */ + while (ip < iend) { + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + length += s; + } + } + /* copy literals */ + cpy = op + length; + /* CORNER-CASE: cpy might overflow. */ + if (cpy < op) + goto _output_error; /* cpy was overflowed, bail! 
*/ + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + if (cpy > oend) + /* Error: writes beyond output buffer */ + goto _output_error; + if (ip + length != iend) + /* + * Error: LZ4 format requires to consume all + * input at this stage + */ + goto _output_error; + (void) memcpy(op, ip, length); + op += length; + /* Necessarily EOF, due to parsing restrictions */ + break; + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + /* + * Error: offset creates reference outside of + * destination buffer + */ + goto _output_error; + + /* get matchlength */ + if ((length = (token & ML_MASK)) == ML_MASK) { + while (ip < iend) { + int s = *ip++; + length += s; + if (s == 255) + continue; + break; + } + } + /* copy repeated sequence */ + if unlikely(op - ref < STEPSIZE) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op-ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + A32(op) = A32(ref); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + /* + * Error: request to write outside of + * destination buffer + */ + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + if (op == oend) + /* + * Check EOF (should never happen, since + * last 5 bytes are supposed to be literals) + */ + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + + /* end of decoding */ + return (int)(((char *)op) - dest); + + /* write overflow error detected */ + _output_error: + return (int)(-(((char *)ip) - source)); +} + +#ifdef __FreeBSD__ + +extern void +lz4_init(void) +{ + +#if HEAPMODE + lz4_ctx_cache = kmem_cache_create("lz4_ctx", sizeof(struct refTables), + 0, NULL, NULL, NULL, NULL, NULL, 0); +#endif +} + +extern void +lz4_fini(void) +{ + +#if HEAPMODE + kmem_cache_destroy(lz4_ctx_cache); +#endif +} + +#endif /* __FreeBSD__ */ Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 lzjb.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c 27 Feb 2010 22:30:59 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c 2 Nov 2013 01:24:10 -0000 @@ -20,12 +20,11 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * We keep our own copy of this algorithm for 2 main reasons: + * We keep our own copy of this algorithm for 3 main reasons: * 1. If we didn't, anyone modifying common/os/compress.c would * directly break our on disk format * 2. Our version of lzjb does not have a number of checks that the @@ -33,11 +32,13 @@ * 3. We initialize the lempel to ensure deterministic results, * so that identical blocks can always be deduplicated. * In particular, we are adding the "feature" that compress() can - * take a destination buffer size and return -1 if the data will not - * compress to d_len or less. 
+ * take a destination buffer size and returns the compressed length, or the + * source length if compression would overflow the destination buffer. */ +#include #include +#include #define MATCH_BITS 6 #define MATCH_MIN 3 @@ -51,7 +52,8 @@ lzjb_compress(void *s_start, void *d_sta { uchar_t *src = s_start; uchar_t *dst = d_start; - uchar_t *cpy, *copymap; + uchar_t *cpy; + uchar_t *copymap = NULL; int copymask = 1 << (NBBY - 1); int mlen, offset, hash; uint16_t *hp; @@ -100,7 +102,8 @@ lzjb_decompress(void *s_start, void *d_s uchar_t *src = s_start; uchar_t *dst = d_start; uchar_t *d_end = (uchar_t *)d_start + d_len; - uchar_t *cpy, copymap; + uchar_t *cpy; + uchar_t copymap = 0; int copymask = 1 << (NBBY - 1); while (dst < d_end) { @@ -114,7 +117,9 @@ lzjb_decompress(void *s_start, void *d_s src += 2; if ((cpy = dst - offset) < (uchar_t *)d_start) return (-1); - while (--mlen >= 0 && dst < d_end) + if (mlen > (d_end - dst)) + mlen = d_end - dst; + while (--mlen >= 0) *dst++ = *cpy++; } else { *dst++ = *src++; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 metaslab.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c 27 Feb 2010 22:31:01 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c 27 Mar 2017 06:19:45 -0000 @@ -19,8 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include @@ -30,14 +32,107 @@ #include #include #include +#include +#include + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); + +#define GANG_ALLOCATION(flags) \ + ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER)) uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, + &metaslab_gang_bang, 0, + "Force gang block allocation for blocks larger than or equal to this value"); + +/* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. + */ +int zfs_condense_pct = 200; +SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, + &zfs_condense_pct, 0, + "Condense on-disk spacemap when it is more than this many percents" + " of in-memory counterpart"); + +/* + * Condensing a metaslab is not guaranteed to actually reduce the amount of + * space used on disk. In particular, a space map uses data in increments of + * MAX(1 << ashift, space_map_blksize), so a metaslab might use the + * same number of blocks after condensing. Since the goal of condensing is to + * reduce the number of IOPs required to read the space map, we only want to + * condense when we can be sure we will reduce the number of blocks used by the + * space map. 
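Stepping back to the lzjb.c hunk above: the comment change documents a different contract for lzjb_compress(), which now reports the source length when the data will not fit in d_len instead of returning -1. A hypothetical caller written against that contract (the helper name and error handling here are illustrative, not the actual zio_compress_data() path, and the five-argument prototype with a trailing level argument is assumed from the upstream ZFS sources):

    #include <stddef.h>

    extern size_t lzjb_compress(void *, void *, size_t, size_t, int);

    /*
     * Illustrative only: compress s_len bytes into a d_len-byte buffer
     * and report whether the compressed form is worth keeping.  Under
     * the documented contract, a result >= s_len means compression did
     * not help or would have overflowed dst, so the caller keeps the
     * original data.
     */
    static int
    try_compress(void *src, void *dst, size_t s_len, size_t d_len,
        size_t *c_lenp)
    {
            size_t c_len = lzjb_compress(src, dst, s_len, d_len, 0);

            if (c_len >= s_len)
                    return (0);     /* store the block uncompressed */
            *c_lenp = c_len;
            return (1);
    }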
Unfortunately, we cannot precisely compute whether or not this is + * the case in metaslab_should_condense since we are holding ms_lock. Instead, + * we apply the following heuristic: do not condense a spacemap unless the + * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold + * blocks. + */ +int zfs_metaslab_condense_block_threshold = 4; + +/* + * The zfs_mg_noalloc_threshold defines which metaslab groups should + * be eligible for allocation. The value is defined as a percentage of + * free space. Metaslab groups that have more free space than + * zfs_mg_noalloc_threshold are always eligible for allocations. Once + * a metaslab group's free space is less than or equal to the + * zfs_mg_noalloc_threshold the allocator will avoid allocating to that + * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. + * Once all groups in the pool reach zfs_mg_noalloc_threshold then all + * groups are allowed to accept allocations. Gang blocks are always + * eligible to allocate on any metaslab group. The default value of 0 means + * no metaslab group will be excluded based on this criterion. + */ +int zfs_mg_noalloc_threshold = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, + &zfs_mg_noalloc_threshold, 0, + "Percentage of metaslab group size that should be free" + " to make it eligible for allocation"); + +/* + * Metaslab groups are considered eligible for allocations if their + * fragmenation metric (measured as a percentage) is less than or equal to + * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold + * then it will be skipped unless all metaslab groups within the metaslab + * class have also crossed this threshold. + */ +int zfs_mg_fragmentation_threshold = 85; +SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, + &zfs_mg_fragmentation_threshold, 0, + "Percentage of metaslab group size that should be considered " + "eligible for allocations unless all metaslab groups within the metaslab class " + "have also crossed this threshold"); + +/* + * Allow metaslabs to keep their active state as long as their fragmentation + * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An + * active metaslab that exceeds this threshold will no longer keep its active + * status allowing better metaslabs to be selected. + */ +int zfs_metaslab_fragmentation_threshold = 70; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, + &zfs_metaslab_fragmentation_threshold, 0, + "Maximum percentage of metaslab fragmentation level to keep their active state"); + +/* + * When set will load all metaslabs when pool is first opened. + */ +int metaslab_debug_load = 0; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, + &metaslab_debug_load, 0, + "Load all metaslabs when pool is first opened"); /* - * Metaslab debugging: when set, keeps all space maps in core to verify frees. + * When set will prevent metaslabs from being unloaded. */ -static int metaslab_debug = 0; +int metaslab_debug_unload = 0; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, + &metaslab_debug_unload, 0, + "Prevent metaslabs from being unloaded"); /* * Minimum size which forces the dynamic allocator to change @@ -45,31 +140,123 @@ static int metaslab_debug = 0; * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). 
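The two group-level gates introduced here, zfs_mg_noalloc_threshold on free capacity and zfs_mg_fragmentation_threshold on the fragmentation metric, are applied later in metaslab_group_alloc_update(). A simplified standalone restatement of that policy (the ZFS_FRAG_INVALID sentinel value is assumed here; the real code also requires a positive activation count and feeds the result into the class-wide mc_alloc_groups bookkeeping):

    #include <stdint.h>

    #define ZFS_FRAG_INVALID  UINT64_MAX    /* assumed sentinel: "metric unknown" */

    extern int zfs_mg_noalloc_threshold;          /* percent, default 0 */
    extern int zfs_mg_fragmentation_threshold;    /* percent, default 85 */

    /*
     * Sketch: a metaslab group stays eligible for allocations while its
     * free capacity is above zfs_mg_noalloc_threshold and, when a
     * fragmentation metric exists, that metric does not exceed
     * zfs_mg_fragmentation_threshold.
     */
    static int
    mg_would_be_allocatable(uint64_t space, uint64_t alloc, uint64_t frag)
    {
            uint64_t free_capacity = ((space - alloc) * 100) / (space + 1);

            return (free_capacity > (uint64_t)zfs_mg_noalloc_threshold &&
                (frag == ZFS_FRAG_INVALID ||
                frag <= (uint64_t)zfs_mg_fragmentation_threshold));
    }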
*/ -uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; +uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, + &metaslab_df_alloc_threshold, 0, + "Minimum size which forces the dynamic allocator to change it's allocation strategy"); /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. - * Once the space_map's free space drops below this level we dynamically + * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. */ int metaslab_df_free_pct = 4; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, + &metaslab_df_free_pct, 0, + "The minimum free space, in percent, which must be available in a " + "space map to continue allocations in a first-fit fashion"); /* * A metaslab is considered "free" if it contains a contiguous * segment which is greater than metaslab_min_alloc_size. */ uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, + &metaslab_min_alloc_size, 0, + "A metaslab is considered \"free\" if it contains a contiguous " + "segment which is greater than vfs.zfs.metaslab.min_alloc_size"); + +/* + * Percentage of all cpus that can be used by the metaslab taskq. + */ +int metaslab_load_pct = 50; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, + &metaslab_load_pct, 0, + "Percentage of cpus that can be used by the metaslab taskq"); + +/* + * Determines how many txgs a metaslab may remain loaded without having any + * allocations from it. As long as a metaslab continues to be used we will + * keep it loaded. + */ +int metaslab_unload_delay = TXG_SIZE * 2; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, + &metaslab_unload_delay, 0, + "Number of TXGs that an unused metaslab can be kept in memory"); + +/* + * Max number of metaslabs per group to preload. + */ +int metaslab_preload_limit = SPA_DVAS_PER_BP; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, + &metaslab_preload_limit, 0, + "Max number of metaslabs per group to preload"); + +/* + * Enable/disable preloading of metaslab. + */ +boolean_t metaslab_preload_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, + &metaslab_preload_enabled, 0, + "Max number of metaslabs per group to preload"); + +/* + * Enable/disable fragmentation weighting on metaslabs. + */ +boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, + &metaslab_fragmentation_factor_enabled, 0, + "Enable fragmentation weighting on metaslabs"); + +/* + * Enable/disable lba weighting (i.e. outer tracks are given preference). + */ +boolean_t metaslab_lba_weighting_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, + &metaslab_lba_weighting_enabled, 0, + "Enable LBA weighting (i.e. outer tracks are given preference)"); + +/* + * Enable/disable metaslab group biasing. + */ +boolean_t metaslab_bias_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, + &metaslab_bias_enabled, 0, + "Enable metaslab group biasing"); + +/* + * Enable/disable segment-based metaslab selection. 
+ */ +boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE; + +/* + * When using segment-based metaslab selection, we will continue + * allocating from the active metaslab until we have exhausted + * zfs_metaslab_switch_threshold of its buckets. + */ +int zfs_metaslab_switch_threshold = 2; /* - * Max number of space_maps to prefetch. + * Internal switch to enable/disable the metaslab allocation tracing + * facility. */ -int metaslab_prefetch_limit = SPA_DVAS_PER_BP; +boolean_t metaslab_trace_enabled = B_TRUE; /* - * Percentage bonus multiplier for metaslabs that are in the bonus area. + * Maximum entries that the metaslab allocation tracing facility will keep + * in a given list when running in non-debug mode. We limit the number + * of entries in non-debug mode to prevent us from using up too much memory. + * The limit should be sufficiently large that we don't expect any allocation + * to every exceed this value. In debug mode, the system will panic if this + * limit is ever reached allowing for further investigation. */ -int metaslab_smo_bonus_pct = 150; +uint64_t metaslab_trace_max_entries = 5000; + +static uint64_t metaslab_weight(metaslab_t *); +static void metaslab_set_fragmentation(metaslab_t *); + +kmem_cache_t *metaslab_alloc_trace_cache; /* * ========================================================================== @@ -77,7 +264,7 @@ int metaslab_smo_bonus_pct = 150; * ========================================================================== */ metaslab_class_t * -metaslab_class_create(spa_t *spa, space_map_ops_t *ops) +metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) { metaslab_class_t *mc; @@ -86,6 +273,8 @@ metaslab_class_create(spa_t *spa, space_ mc->mc_spa = spa; mc->mc_rotor = NULL; mc->mc_ops = ops; + mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); + refcount_create_tracked(&mc->mc_alloc_slots); return (mc); } @@ -99,6 +288,8 @@ metaslab_class_destroy(metaslab_class_t ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); + refcount_destroy(&mc->mc_alloc_slots); + mutex_destroy(&mc->mc_lock); kmem_free(mc, sizeof (metaslab_class_t)); } @@ -138,6 +329,27 @@ metaslab_class_space_update(metaslab_cla atomic_add_64(&mc->mc_dspace, dspace_delta); } +void +metaslab_class_minblocksize_update(metaslab_class_t *mc) +{ + metaslab_group_t *mg; + vdev_t *vd; + uint64_t minashift = UINT64_MAX; + + if ((mg = mc->mc_rotor) == NULL) { + mc->mc_minblocksize = SPA_MINBLOCKSIZE; + return; + } + + do { + vd = mg->mg_vd; + if (vd->vdev_ashift < minashift) + minashift = vd->vdev_ashift; + } while ((mg = mg->mg_next) != mc->mc_rotor); + + mc->mc_minblocksize = 1ULL << minashift; +} + uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { @@ -162,11 +374,133 @@ metaslab_class_get_dspace(metaslab_class return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } +uint64_t +metaslab_class_get_minblocksize(metaslab_class_t *mc) +{ + return (mc->mc_minblocksize); +} + +void +metaslab_class_histogram_verify(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t *mc_hist; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, or + * vdevs that are not in this metalab class. 
+ */ + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + mc_hist[i] += mg->mg_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); + + kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + /* - * ========================================================================== - * Metaslab groups - * ========================================================================== + * Calculate the metaslab class's fragmentation metric. The metric + * is weighted based on the space contribution of each metaslab group. + * The return value will be a number between 0 and 100 (inclusive), or + * ZFS_FRAG_INVALID if the metric has not been set. See comment above the + * zfs_frag_table for more information about the metric. + */ +uint64_t +metaslab_class_fragmentation(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t fragmentation = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, or + * vdevs that are not in this metalab class. + */ + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + /* + * If a metaslab group does not contain a fragmentation + * metric then just bail out. + */ + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (ZFS_FRAG_INVALID); + } + + /* + * Determine how much this metaslab_group is contributing + * to the overall pool fragmentation metric. + */ + fragmentation += mg->mg_fragmentation * + metaslab_group_get_space(mg); + } + fragmentation /= metaslab_class_get_space(mc); + + ASSERT3U(fragmentation, <=, 100); + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (fragmentation); +} + +/* + * Calculate the amount of expandable space that is available in + * this metaslab class. If a device is expanded then its expandable + * space will be the amount of allocatable space that is currently not + * part of this metaslab class. */ +uint64_t +metaslab_class_expandable_space(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t space = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + /* + * Calculate if we have enough space to add additional + * metaslabs. We report the expandable space in terms + * of the metaslab size since that's the unit of expansion. + */ + space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, + 1ULL << tvd->vdev_ms_shift); + } + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (space); +} + static int metaslab_compare(const void *x1, const void *x2) { @@ -181,9 +515,9 @@ metaslab_compare(const void *x1, const v /* * If the weights are identical, use the offset to force uniqueness. 
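As a worked example of the space-weighted average computed by metaslab_class_fragmentation() above (the numbers are purely illustrative): a class with two top-level vdevs contributing 100 GB of metaslab space at 20% fragmentation and 300 GB at 60% fragmentation yields (100 * 20 + 300 * 60) / 400 = 50, so the larger, more fragmented group dominates the pool-wide metric; and if any group reports ZFS_FRAG_INVALID, the whole class reports ZFS_FRAG_INVALID.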
*/ - if (m1->ms_map.sm_start < m2->ms_map.sm_start) + if (m1->ms_start < m2->ms_start) return (-1); - if (m1->ms_map.sm_start > m2->ms_map.sm_start) + if (m1->ms_start > m2->ms_start) return (1); ASSERT3P(m1, ==, m2); @@ -191,6 +525,138 @@ metaslab_compare(const void *x1, const v return (0); } +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocated = 0; + uint64_t freed = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an allocated + * space map. Calling this in non-syncing context does not + * provide a consistent view of the metaslab since we're performing + * allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - + space_map_alloc_delta(msp->ms_sm); + + /* + * Account for future allocations since we would have already + * deducted that space from the ms_freetree. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocated += + range_tree_space(msp->ms_alloctree[(txg + t) & TXG_MASK]); + } + freed = range_tree_space(msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]); + + msp_free_space = range_tree_space(msp->ms_tree) + allocated + + msp->ms_deferspace + freed; + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + +/* + * ========================================================================== + * Metaslab groups + * ========================================================================== + */ +/* + * Update the allocatable flag and the metaslab group's capacity. + * The allocatable flag is set to true if the capacity is below + * the zfs_mg_noalloc_threshold or has a fragmentation value that is + * greater than zfs_mg_fragmentation_threshold. If a metaslab group + * transitions from allocatable to non-allocatable or vice versa then the + * metaslab group's class is updated to reflect the transition. + */ +static void +metaslab_group_alloc_update(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + metaslab_class_t *mc = mg->mg_class; + vdev_stat_t *vs = &vd->vdev_stat; + boolean_t was_allocatable; + boolean_t was_initialized; + + ASSERT(vd == vd->vdev_top); + + mutex_enter(&mg->mg_lock); + was_allocatable = mg->mg_allocatable; + was_initialized = mg->mg_initialized; + + mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / + (vs->vs_space + 1); + + mutex_enter(&mc->mc_lock); + + /* + * If the metaslab group was just added then it won't + * have any space until we finish syncing out this txg. + * At that point we will consider it initialized and available + * for allocations. We also don't consider non-activated + * metaslab groups (e.g. vdevs that are in the middle of being removed) + * to be initialized, because they can't be used for allocation. + */ + mg->mg_initialized = metaslab_group_initialized(mg); + if (!was_initialized && mg->mg_initialized) { + mc->mc_groups++; + } else if (was_initialized && !mg->mg_initialized) { + ASSERT3U(mc->mc_groups, >, 0); + mc->mc_groups--; + } + if (mg->mg_initialized) + mg->mg_no_free_space = B_FALSE; + + /* + * A metaslab group is considered allocatable if it has plenty + * of free space or is not heavily fragmented. 
We only take + * fragmentation into account if the metaslab group has a valid + * fragmentation metric (i.e. a value between 0 and 100). + */ + mg->mg_allocatable = (mg->mg_activation_count > 0 && + mg->mg_free_capacity > zfs_mg_noalloc_threshold && + (mg->mg_fragmentation == ZFS_FRAG_INVALID || + mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); + + /* + * The mc_alloc_groups maintains a count of the number of + * groups in this metaslab class that are still above the + * zfs_mg_noalloc_threshold. This is used by the allocating + * threads to determine if they should avoid allocations to + * a given group. The allocator will avoid allocations to a group + * if that group has reached or is below the zfs_mg_noalloc_threshold + * and there are still other groups that are above the threshold. + * When a group transitions from allocatable to non-allocatable or + * vice versa we update the metaslab class to reflect that change. + * When the mc_alloc_groups value drops to 0 that means that all + * groups have reached the zfs_mg_noalloc_threshold making all groups + * eligible for allocations. This effectively means that all devices + * are balanced again. + */ + if (was_allocatable && !mg->mg_allocatable) + mc->mc_alloc_groups--; + else if (!was_allocatable && mg->mg_allocatable) + mc->mc_alloc_groups++; + mutex_exit(&mc->mc_lock); + + mutex_exit(&mg->mg_lock); +} + metaslab_group_t * metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) { @@ -203,6 +669,12 @@ metaslab_group_create(metaslab_class_t * mg->mg_vd = vd; mg->mg_class = mc; mg->mg_activation_count = 0; + mg->mg_initialized = B_FALSE; + mg->mg_no_free_space = B_TRUE; + refcount_create_tracked(&mg->mg_alloc_queue_depth); + + mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, + minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); return (mg); } @@ -219,8 +691,10 @@ metaslab_group_destroy(metaslab_group_t */ ASSERT(mg->mg_activation_count <= 0); + taskq_destroy(mg->mg_taskq); avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); + refcount_destroy(&mg->mg_alloc_queue_depth); kmem_free(mg, sizeof (metaslab_group_t)); } @@ -241,6 +715,7 @@ metaslab_group_activate(metaslab_group_t return; mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + metaslab_group_alloc_update(mg); if ((mgprev = mc->mc_rotor) == NULL) { mg->mg_prev = mg; @@ -253,6 +728,7 @@ metaslab_group_activate(metaslab_group_t mgnext->mg_prev = mg; } mc->mc_rotor = mg; + metaslab_class_minblocksize_update(mc); } void @@ -271,6 +747,9 @@ metaslab_group_passivate(metaslab_group_ return; } + taskq_wait(mg->mg_taskq); + metaslab_group_alloc_update(mg); + mgprev = mg->mg_prev; mgnext = mg->mg_next; @@ -284,22 +763,125 @@ metaslab_group_passivate(metaslab_group_ mg->mg_prev = NULL; mg->mg_next = NULL; + metaslab_class_minblocksize_update(mc); +} + +boolean_t +metaslab_group_initialized(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + vdev_stat_t *vs = &vd->vdev_stat; + + return (vs->vs_space != 0 && mg->mg_activation_count > 0); +} + +uint64_t +metaslab_group_get_space(metaslab_group_t *mg) +{ + return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); +} + +void +metaslab_group_histogram_verify(metaslab_group_t *mg) +{ + uint64_t *mg_hist; + vdev_t *vd = mg->mg_vd; + uint64_t ashift = vd->vdev_ashift; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, + 
SPACE_MAP_HISTOGRAM_SIZE + ashift); + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_sm == NULL) + continue; + + for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + mg_hist[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); } static void -metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + mg->mg_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +void +metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + ASSERT3U(mg->mg_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + ASSERT3U(mc->mc_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + + mg->mg_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +{ ASSERT(msp->ms_group == NULL); + mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); + + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_add(mg, msp); + mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_remove(mg, msp); + mutex_exit(&msp->ms_lock); + mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); @@ -312,9 +894,9 @@ metaslab_group_sort(metaslab_group_t *mg { /* * Although in principle the weight can be any value, in - * practice we do not use values in the range [1, 510]. + * practice we do not use values in the range [1, 511]. */ - ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); + ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); @@ -326,131 +908,314 @@ metaslab_group_sort(metaslab_group_t *mg } /* - * ========================================================================== - * Common allocator routines - * ========================================================================== + * Calculate the fragmentation for a given metaslab group. We can use + * a simple average here since all metaslabs within the group must have + * the same size. The return value will be a value between 0 and 100 + * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this + * group have a fragmentation metric. 
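To make the half-valid rule in metaslab_group_fragmentation() concrete (illustrative numbers): on a top-level vdev with 100 metaslabs of which only 40 carry a valid fragmentation metric, 40 <= 100 / 2 holds and the group reports ZFS_FRAG_INVALID; with 60 valid metaslabs whose metrics sum to 3000, the group reports 3000 / 60 = 50.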
*/ -static int -metaslab_segsize_compare(const void *x1, const void *x2) +uint64_t +metaslab_group_fragmentation(metaslab_group_t *mg) { - const space_seg_t *s1 = x1; - const space_seg_t *s2 = x2; - uint64_t ss_size1 = s1->ss_end - s1->ss_start; - uint64_t ss_size2 = s2->ss_end - s2->ss_start; + vdev_t *vd = mg->mg_vd; + uint64_t fragmentation = 0; + uint64_t valid_ms = 0; - if (ss_size1 < ss_size2) - return (-1); - if (ss_size1 > ss_size2) - return (1); + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; - if (s1->ss_start < s2->ss_start) - return (-1); - if (s1->ss_start > s2->ss_start) - return (1); + if (msp->ms_fragmentation == ZFS_FRAG_INVALID) + continue; - return (0); + valid_ms++; + fragmentation += msp->ms_fragmentation; + } + + if (valid_ms <= vd->vdev_ms_count / 2) + return (ZFS_FRAG_INVALID); + + fragmentation /= valid_ms; + ASSERT3U(fragmentation, <=, 100); + return (fragmentation); } /* - * This is a helper function that can be used by the allocator to find - * a suitable block to allocate. This will search the specified AVL - * tree looking for a block that matches the specified criteria. + * Determine if a given metaslab group should skip allocations. A metaslab + * group should avoid allocations if its free capacity is less than the + * zfs_mg_noalloc_threshold or its fragmentation metric is greater than + * zfs_mg_fragmentation_threshold and there is at least one metaslab group + * that can still handle allocations. If the allocation throttle is enabled + * then we skip allocations to devices that have reached their maximum + * allocation queue depth unless the selected metaslab group is the only + * eligible group remaining. */ -static uint64_t -metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, - uint64_t align) +static boolean_t +metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, + uint64_t psize) { - space_seg_t *ss, ssearch; - avl_index_t where; + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_class_t *mc = mg->mg_class; - ssearch.ss_start = *cursor; - ssearch.ss_end = *cursor + size; + /* + * We can only consider skipping this metaslab group if it's + * in the normal metaslab class and there are other metaslab + * groups to select from. Otherwise, we always consider it eligible + * for allocations. + */ + if (mc != spa_normal_class(spa) || mc->mc_groups <= 1) + return (B_TRUE); + + /* + * If the metaslab group's mg_allocatable flag is set (see comments + * in metaslab_group_alloc_update() for more information) and + * the allocation throttle is disabled then allow allocations to this + * device. However, if the allocation throttle is enabled then + * check if we have reached our allocation limit (mg_alloc_queue_depth) + * to determine if we should allow allocations to this metaslab group. + * If all metaslab groups are no longer considered allocatable + * (mc_alloc_groups == 0) or we're trying to allocate the smallest + * gang block size then we allow allocations on this metaslab group + * regardless of the mg_allocatable or throttle settings. 
+ */ + if (mg->mg_allocatable) { + metaslab_group_t *mgp; + int64_t qdepth; + uint64_t qmax = mg->mg_max_alloc_queue_depth; - ss = avl_find(t, &ssearch, &where); - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); + if (!mc->mc_alloc_throttle_enabled) + return (B_TRUE); - while (ss != NULL) { - uint64_t offset = P2ROUNDUP(ss->ss_start, align); + /* + * If this metaslab group does not have any free space, then + * there is no point in looking further. + */ + if (mg->mg_no_free_space) + return (B_FALSE); - if (offset + size <= ss->ss_end) { - *cursor = offset + size; - return (offset); - } - ss = AVL_NEXT(t, ss); + qdepth = refcount_count(&mg->mg_alloc_queue_depth); + + /* + * If this metaslab group is below its qmax or it's + * the only allocatable metasable group, then attempt + * to allocate from it. + */ + if (qdepth < qmax || mc->mc_alloc_groups == 1) + return (B_TRUE); + ASSERT3U(mc->mc_alloc_groups, >, 1); + + /* + * Since this metaslab group is at or over its qmax, we + * need to determine if there are metaslab groups after this + * one that might be able to handle this allocation. This is + * racy since we can't hold the locks for all metaslab + * groups at the same time when we make this check. + */ + for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) { + qmax = mgp->mg_max_alloc_queue_depth; + + qdepth = refcount_count(&mgp->mg_alloc_queue_depth); + + /* + * If there is another metaslab group that + * might be able to handle the allocation, then + * we return false so that we skip this group. + */ + if (qdepth < qmax && !mgp->mg_no_free_space) + return (B_FALSE); + } + + /* + * We didn't find another group to handle the allocation + * so we can't skip this metaslab group even though + * we are at or over our qmax. + */ + return (B_TRUE); + + } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { + return (B_TRUE); } + return (B_FALSE); +} - /* - * If we know we've searched the whole map (*cursor == 0), give up. - * Otherwise, reset the cursor to the beginning and try again. - */ - if (*cursor == 0) - return (-1ULL); +/* + * ========================================================================== + * Range tree callbacks + * ========================================================================== + */ - *cursor = 0; - return (metaslab_block_picker(t, cursor, size, align)); +/* + * Comparison function for the private size-ordered tree. Tree is sorted + * by size, larger sizes at the end of the tree. + */ +static int +metaslab_rangesize_compare(const void *x1, const void *x2) +{ + const range_seg_t *r1 = x1; + const range_seg_t *r2 = x2; + uint64_t rs_size1 = r1->rs_end - r1->rs_start; + uint64_t rs_size2 = r2->rs_end - r2->rs_start; + + if (rs_size1 < rs_size2) + return (-1); + if (rs_size1 > rs_size2) + return (1); + + if (r1->rs_start < r2->rs_start) + return (-1); + + if (r1->rs_start > r2->rs_start) + return (1); + + return (0); } +/* + * Create any block allocator specific components. The current allocators + * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
+ */ static void -metaslab_pp_load(space_map_t *sm) +metaslab_rt_create(range_tree_t *rt, void *arg) { - space_seg_t *ss; - - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + metaslab_t *msp = arg; - sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(sm->sm_pp_root, metaslab_segsize_compare, - sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + ASSERT3P(rt->rt_arg, ==, msp); + ASSERT(msp->ms_tree == NULL); - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - avl_add(sm->sm_pp_root, ss); + avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, + sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); } +/* + * Destroy the block allocator specific components. + */ static void -metaslab_pp_unload(space_map_t *sm) +metaslab_rt_destroy(range_tree_t *rt, void *arg) { - void *cookie = NULL; + metaslab_t *msp = arg; - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; + ASSERT3P(rt->rt_arg, ==, msp); + ASSERT3P(msp->ms_tree, ==, rt); + ASSERT0(avl_numnodes(&msp->ms_size_tree)); - while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { - /* tear down the tree */ - } + avl_destroy(&msp->ms_size_tree); +} + +static void +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_t *msp = arg; - avl_destroy(sm->sm_pp_root); - kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); - sm->sm_pp_root = NULL; + ASSERT3P(rt->rt_arg, ==, msp); + ASSERT3P(msp->ms_tree, ==, rt); + VERIFY(!msp->ms_condensing); + avl_add(&msp->ms_size_tree, rs); } -/* ARGSUSED */ static void -metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) { - /* No need to update cursor */ + metaslab_t *msp = arg; + + ASSERT3P(rt->rt_arg, ==, msp); + ASSERT3P(msp->ms_tree, ==, rt); + VERIFY(!msp->ms_condensing); + avl_remove(&msp->ms_size_tree, rs); } -/* ARGSUSED */ static void -metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_rt_vacate(range_tree_t *rt, void *arg) { - /* No need to update cursor */ + metaslab_t *msp = arg; + + ASSERT3P(rt->rt_arg, ==, msp); + ASSERT3P(msp->ms_tree, ==, rt); + + /* + * Normally one would walk the tree freeing nodes along the way. + * Since the nodes are shared with the range trees we can avoid + * walking all nodes and just reinitialize the avl tree. The nodes + * will be freed by the range tree, so we don't want to free them here. + */ + avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, + sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); } +static range_tree_ops_t metaslab_rt_ops = { + metaslab_rt_create, + metaslab_rt_destroy, + metaslab_rt_add, + metaslab_rt_remove, + metaslab_rt_vacate +}; + +/* + * ========================================================================== + * Common allocator routines + * ========================================================================== + */ + /* * Return the maximum contiguous segment within the metaslab. 
*/ uint64_t -metaslab_pp_maxsize(space_map_t *sm) +metaslab_block_maxsize(metaslab_t *msp) { - avl_tree_t *t = sm->sm_pp_root; - space_seg_t *ss; + avl_tree_t *t = &msp->ms_size_tree; + range_seg_t *rs; - if (t == NULL || (ss = avl_last(t)) == NULL) + if (t == NULL || (rs = avl_last(t)) == NULL) return (0ULL); - return (ss->ss_end - ss->ss_start); + return (rs->rs_end - rs->rs_start); +} + +static range_seg_t * +metaslab_block_find(avl_tree_t *t, uint64_t start, uint64_t size) +{ + range_seg_t *rs, rsearch; + avl_index_t where; + + rsearch.rs_start = start; + rsearch.rs_end = start + size; + + rs = avl_find(t, &rsearch, &where); + if (rs == NULL) { + rs = avl_nearest(t, where, AVL_AFTER); + } + + return (rs); +} + +/* + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. This will search the specified AVL + * tree looking for a block that matches the specified criteria. + */ +static uint64_t +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, + uint64_t align) +{ + range_seg_t *rs = metaslab_block_find(t, *cursor, size); + + while (rs != NULL) { + uint64_t offset = P2ROUNDUP(rs->rs_start, align); + + if (offset + size <= rs->rs_end) { + *cursor = offset + size; + return (offset); + } + rs = AVL_NEXT(t, rs); + } + + /* + * If we know we've searched the whole map (*cursor == 0), give up. + * Otherwise, reset the cursor to the beginning and try again. + */ + if (*cursor == 0) + return (-1ULL); + + *cursor = 0; + return (metaslab_block_picker(t, cursor, size, align)); } /* @@ -459,30 +1224,24 @@ metaslab_pp_maxsize(space_map_t *sm) * ========================================================================== */ static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) +metaslab_ff_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &sm->sm_root; + /* + * Find the largest power of 2 block size that evenly divides the + * requested size. This is used to try to allocate blocks with similar + * alignment from the same area of the metaslab (i.e. same cursor + * bucket) but it does not guarantee that other allocations sizes + * may exist in the same region. + */ uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; + avl_tree_t *t = &msp->ms_tree->rt_root; return (metaslab_block_picker(t, cursor, size, align)); } -/* ARGSUSED */ -boolean_t -metaslab_ff_fragmented(space_map_t *sm) -{ - return (B_TRUE); -} - -static space_map_ops_t metaslab_ff_ops = { - metaslab_pp_load, - metaslab_pp_unload, - metaslab_ff_alloc, - metaslab_pp_claim, - metaslab_pp_free, - metaslab_pp_maxsize, - metaslab_ff_fragmented +static metaslab_ops_t metaslab_ff_ops = { + metaslab_ff_alloc }; /* @@ -494,16 +1253,24 @@ static space_map_ops_t metaslab_ff_ops = * ========================================================================== */ static uint64_t -metaslab_df_alloc(space_map_t *sm, uint64_t size) +metaslab_df_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &sm->sm_root; + /* + * Find the largest power of 2 block size that evenly divides the + * requested size. This is used to try to allocate blocks with similar + * alignment from the same area of the metaslab (i.e. same cursor + * bucket) but it does not guarantee that other allocations sizes + * may exist in the same region. 
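The cursor-bucket selection in metaslab_ff_alloc() and metaslab_df_alloc() above relies on size & -size, which isolates the lowest set bit of the requested size, i.e. the largest power of two that evenly divides it; that power of two selects which ms_lbas[] cursor is used. A small worked example, with a local stand-in for highbit64() written under the assumption that it returns the 1-based index of the highest set bit, as the illumos bit routines do:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in: 1-based index of the highest set bit, 0 for 0. */
    static int
    highbit64(uint64_t x)
    {
            int h = 0;

            while (x != 0) {
                    h++;
                    x >>= 1;
            }
            return (h);
    }

    int
    main(void)
    {
            uint64_t size = 24 * 1024;           /* a 24K allocation request */
            uint64_t align = size & -size;       /* 8192: largest power of 2 dividing 24K */
            int bucket = highbit64(align) - 1;   /* 13, i.e. ms_lbas[13], since 8K == 1 << 13 */

            printf("align=%llu bucket=%d\n", (unsigned long long)align, bucket);
            return (0);
    }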
+ */ uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; - uint64_t max_size = metaslab_pp_maxsize(sm); - int free_pct = sm->sm_space * 100 / sm->sm_size; + uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; + range_tree_t *rt = msp->ms_tree; + avl_tree_t *t = &rt->rt_root; + uint64_t max_size = metaslab_block_maxsize(msp); + int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - ASSERT(MUTEX_HELD(sm->sm_lock)); - ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); if (max_size < size) return (-1ULL); @@ -514,216 +1281,265 @@ metaslab_df_alloc(space_map_t *sm, uint6 */ if (max_size < metaslab_df_alloc_threshold || free_pct < metaslab_df_free_pct) { - t = sm->sm_pp_root; + t = &msp->ms_size_tree; *cursor = 0; } return (metaslab_block_picker(t, cursor, size, 1ULL)); } -static boolean_t -metaslab_df_fragmented(space_map_t *sm) -{ - uint64_t max_size = metaslab_pp_maxsize(sm); - int free_pct = sm->sm_space * 100 / sm->sm_size; - - if (max_size >= metaslab_df_alloc_threshold && - free_pct >= metaslab_df_free_pct) - return (B_FALSE); - - return (B_TRUE); -} - -static space_map_ops_t metaslab_df_ops = { - metaslab_pp_load, - metaslab_pp_unload, - metaslab_df_alloc, - metaslab_pp_claim, - metaslab_pp_free, - metaslab_pp_maxsize, - metaslab_df_fragmented +static metaslab_ops_t metaslab_df_ops = { + metaslab_df_alloc }; /* * ========================================================================== - * Other experimental allocators + * Cursor fit block allocator - + * Select the largest region in the metaslab, set the cursor to the beginning + * of the range and the cursor_end to the end of the range. As allocations + * are made advance the cursor. Continue allocating from the cursor until + * the range is exhausted and then find a new range. * ========================================================================== */ static uint64_t -metaslab_cdf_alloc(space_map_t *sm, uint64_t size) +metaslab_cf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &sm->sm_root; - uint64_t *cursor = (uint64_t *)sm->sm_ppd; - uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; - uint64_t max_size = metaslab_pp_maxsize(sm); - uint64_t rsize = size; + range_tree_t *rt = msp->ms_tree; + avl_tree_t *t = &msp->ms_size_tree; + uint64_t *cursor = &msp->ms_lbas[0]; + uint64_t *cursor_end = &msp->ms_lbas[1]; uint64_t offset = 0; - ASSERT(MUTEX_HELD(sm->sm_lock)); - ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); - - if (max_size < size) - return (-1ULL); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); - ASSERT3U(*extent_end, >=, *cursor); + ASSERT3U(*cursor_end, >=, *cursor); - /* - * If we're running low on space switch to using the size - * sorted AVL tree (best-fit). 
- */ - if ((*cursor + size) > *extent_end) { + if ((*cursor + size) > *cursor_end) { + range_seg_t *rs; - t = sm->sm_pp_root; - *cursor = *extent_end = 0; + rs = avl_last(&msp->ms_size_tree); + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) + return (-1ULL); - if (max_size > 2 * SPA_MAXBLOCKSIZE) - rsize = MIN(metaslab_min_alloc_size, max_size); - offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); - if (offset != -1) - *cursor = offset + size; - } else { - offset = metaslab_block_picker(t, cursor, rsize, 1ULL); + *cursor = rs->rs_start; + *cursor_end = rs->rs_end; } - ASSERT3U(*cursor, <=, *extent_end); + + offset = *cursor; + *cursor += size; + return (offset); } -static boolean_t -metaslab_cdf_fragmented(space_map_t *sm) -{ - uint64_t max_size = metaslab_pp_maxsize(sm); - - if (max_size > (metaslab_min_alloc_size * 10)) - return (B_FALSE); - return (B_TRUE); -} - -static space_map_ops_t metaslab_cdf_ops = { - metaslab_pp_load, - metaslab_pp_unload, - metaslab_cdf_alloc, - metaslab_pp_claim, - metaslab_pp_free, - metaslab_pp_maxsize, - metaslab_cdf_fragmented +static metaslab_ops_t metaslab_cf_ops = { + metaslab_cf_alloc }; +/* + * ========================================================================== + * New dynamic fit allocator - + * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift + * contiguous blocks. If no region is found then just use the largest segment + * that remains. + * ========================================================================== + */ + +/* + * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) + * to request from the allocator. + */ +uint64_t metaslab_ndf_clump_shift = 4; + static uint64_t -metaslab_ndf_alloc(space_map_t *sm, uint64_t size) +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) { - avl_tree_t *t = &sm->sm_root; + avl_tree_t *t = &msp->ms_tree->rt_root; avl_index_t where; - space_seg_t *ss, ssearch; - uint64_t *cursor = (uint64_t *)sm->sm_ppd; - uint64_t max_size = metaslab_pp_maxsize(sm); + range_seg_t *rs, rsearch; + uint64_t hbit = highbit64(size); + uint64_t *cursor = &msp->ms_lbas[hbit - 1]; + uint64_t max_size = metaslab_block_maxsize(msp); - ASSERT(MUTEX_HELD(sm->sm_lock)); - ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); if (max_size < size) return (-1ULL); - ssearch.ss_start = *cursor; - ssearch.ss_end = *cursor + size; + rsearch.rs_start = *cursor; + rsearch.rs_end = *cursor + size; - ss = avl_find(t, &ssearch, &where); - if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { - t = sm->sm_pp_root; - - if (max_size > 2 * SPA_MAXBLOCKSIZE) - size = MIN(metaslab_min_alloc_size, max_size); - - ssearch.ss_start = 0; - ssearch.ss_end = size; - ss = avl_find(t, &ssearch, &where); - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); - ASSERT(ss != NULL); - } - - if (ss != NULL) { - if (ss->ss_start + size <= ss->ss_end) { - *cursor = ss->ss_start + size; - return (ss->ss_start); - } + rs = avl_find(t, &rsearch, &where); + if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { + t = &msp->ms_size_tree; + + rsearch.rs_start = 0; + rsearch.rs_end = MIN(max_size, + 1ULL << (hbit + metaslab_ndf_clump_shift)); + rs = avl_find(t, &rsearch, &where); + if (rs == NULL) + rs = avl_nearest(t, where, AVL_AFTER); + ASSERT(rs != NULL); + } + + if ((rs->rs_end - rs->rs_start) >= size) { + *cursor = rs->rs_start + size; + return (rs->rs_start); } return 
(-1ULL); } -static boolean_t -metaslab_ndf_fragmented(space_map_t *sm) -{ - uint64_t max_size = metaslab_pp_maxsize(sm); - - if (max_size > (metaslab_min_alloc_size * 10)) - return (B_FALSE); - return (B_TRUE); -} - - -static space_map_ops_t metaslab_ndf_ops = { - metaslab_pp_load, - metaslab_pp_unload, - metaslab_ndf_alloc, - metaslab_pp_claim, - metaslab_pp_free, - metaslab_pp_maxsize, - metaslab_ndf_fragmented +static metaslab_ops_t metaslab_ndf_ops = { + metaslab_ndf_alloc }; -space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; +metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; /* * ========================================================================== * Metaslabs * ========================================================================== */ -metaslab_t * -metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, - uint64_t start, uint64_t size, uint64_t txg) + +/* + * Wait for any in-progress metaslab loads to complete. + */ +void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +int +metaslab_load(metaslab_t *msp) +{ + int error = 0; + boolean_t success = B_FALSE; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_loaded); + ASSERT(!msp->ms_loading); + + msp->ms_loading = B_TRUE; + + /* + * If the space map has not been allocated yet, then treat + * all the space in the metaslab as free and add it to the + * ms_tree. + */ + if (msp->ms_sm != NULL) + error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); + else + range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); + + success = (error == 0); + msp->ms_loading = B_FALSE; + + if (success) { + ASSERT3P(msp->ms_group, !=, NULL); + msp->ms_loaded = B_TRUE; + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defertree[t], + range_tree_remove, msp->ms_tree); + } + msp->ms_max_size = metaslab_block_maxsize(msp); + } + cv_broadcast(&msp->ms_load_cv); + return (error); +} + +void +metaslab_unload(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + range_tree_vacate(msp->ms_tree, NULL, NULL); + msp->ms_loaded = B_FALSE; + msp->ms_weight &= ~METASLAB_ACTIVE_MASK; + msp->ms_max_size = 0; +} + +int +metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, + metaslab_t **msp) { vdev_t *vd = mg->mg_vd; - metaslab_t *msp; + objset_t *mos = vd->vdev_spa->spa_meta_objset; + metaslab_t *ms; + int error; + + ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); + mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + ms->ms_id = id; + ms->ms_start = id << vd->vdev_ms_shift; + ms->ms_size = 1ULL << vd->vdev_ms_shift; + + /* + * We only open space map objects that already exist. All others + * will be opened when we finally allocate an object for it. + */ + if (object != 0) { + error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, + ms->ms_size, vd->vdev_ashift, &ms->ms_lock); - msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); - mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); + if (error != 0) { + kmem_free(ms, sizeof (metaslab_t)); + return (error); + } - msp->ms_smo_syncing = *smo; + ASSERT(ms->ms_sm != NULL); + } /* - * We create the main space map here, but we don't create the - * allocmaps and freemaps until metaslab_sync_done(). This serves + * We create the main range tree here, but we don't create the + * alloctree and freetree until metaslab_sync_done(). 
This serves * two purposes: it allows metaslab_sync_done() to detect the * addition of new space; and for debugging, it ensures that we'd * data fault on any attempt to use this metaslab before it's ready. */ - space_map_create(&msp->ms_map, start, size, - vd->vdev_ashift, &msp->ms_lock); + ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); + metaslab_group_add(mg, ms); - metaslab_group_add(mg, msp); - - if (metaslab_debug && smo->smo_object != 0) { - mutex_enter(&msp->ms_lock); - VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, - SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); - mutex_exit(&msp->ms_lock); - } + metaslab_set_fragmentation(ms); /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. * If we're adding space to an existing pool, the new space * does not become available until after this txg has synced. + * The metaslab's weight will also be initialized when we sync + * out this txg. This ensures that we don't attempt to allocate + * from it before we have initialized it completely. */ if (txg <= TXG_INITIAL) - metaslab_sync_done(msp, 0); + metaslab_sync_done(ms, 0); + + /* + * If metaslab_debug_load is set and we're initializing a metaslab + * that has an allocated space map object then load the its space + * map so that can verify frees. + */ + if (metaslab_debug_load && ms->ms_sm != NULL) { + mutex_enter(&ms->ms_lock); + VERIFY0(metaslab_load(ms)); + mutex_exit(&ms->ms_lock); + } if (txg != 0) { vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, msp, txg); + vdev_dirty(vd, VDD_METASLAB, ms, txg); } - return (msp); + *msp = ms; + + return (0); } void @@ -731,52 +1547,187 @@ metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - vdev_space_update(mg->mg_vd, - -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); - metaslab_group_remove(mg, msp); mutex_enter(&msp->ms_lock); + VERIFY(msp->ms_group == NULL); + vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), + 0, -msp->ms_size); + space_map_close(msp->ms_sm); - space_map_unload(&msp->ms_map); - space_map_destroy(&msp->ms_map); + metaslab_unload(msp); + range_tree_destroy(msp->ms_tree); for (int t = 0; t < TXG_SIZE; t++) { - space_map_destroy(&msp->ms_allocmap[t]); - space_map_destroy(&msp->ms_freemap[t]); + range_tree_destroy(msp->ms_alloctree[t]); + range_tree_destroy(msp->ms_freetree[t]); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_destroy(&msp->ms_defermap[t]); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_destroy(msp->ms_defertree[t]); + } - ASSERT3S(msp->ms_deferspace, ==, 0); + ASSERT0(msp->ms_deferspace); mutex_exit(&msp->ms_lock); + cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); kmem_free(msp, sizeof (metaslab_t)); } -#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) -#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) -#define METASLAB_ACTIVE_MASK \ - (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) +#define FRAGMENTATION_TABLE_SIZE 17 + +/* + * This table defines a segment size based fragmentation metric that will + * allow each metaslab to derive its own fragmentation value. This is done + * by calculating the space in each bucket of the spacemap histogram and + * multiplying that by the fragmetation metric in this table. Doing + * this for all buckets and dividing it by the total amount of free + * space in this metaslab (i.e. the total free space in all buckets) gives + * us the fragmentation metric. 
This means that a high fragmentation metric + * equates to most of the free space being comprised of small segments. + * Conversely, if the metric is low, then most of the free space is in + * large segments. A 10% change in fragmentation equates to approximately + * double the number of segments. + * + * This table defines 0% fragmented space using 16MB segments. Testing has + * shown that segments that are greater than or equal to 16MB do not suffer + * from drastic performance problems. Using this value, we derive the rest + * of the table. Since the fragmentation value is never stored on disk, it + * is possible to change these calculations in the future. + */ +int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { + 100, /* 512B */ + 100, /* 1K */ + 98, /* 2K */ + 95, /* 4K */ + 90, /* 8K */ + 80, /* 16K */ + 70, /* 32K */ + 60, /* 64K */ + 50, /* 128K */ + 40, /* 256K */ + 30, /* 512K */ + 20, /* 1M */ + 15, /* 2M */ + 10, /* 4M */ + 5, /* 8M */ + 0 /* 16M */ +}; + +/* + * Calclate the metaslab's fragmentation metric. A return value + * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does + * not support this metric. Otherwise, the return value should be in the + * range [0, 100]. + */ +static void +metaslab_set_fragmentation(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t fragmentation = 0; + uint64_t total = 0; + boolean_t feature_enabled = spa_feature_is_enabled(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM); + + if (!feature_enabled) { + msp->ms_fragmentation = ZFS_FRAG_INVALID; + return; + } + + /* + * A null space map means that the entire metaslab is free + * and thus is not fragmented. + */ + if (msp->ms_sm == NULL) { + msp->ms_fragmentation = 0; + return; + } + + /* + * If this metaslab's space map has not been upgraded, flag it + * so that we upgrade next time we encounter it. + */ + if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { + uint64_t txg = spa_syncing_txg(spa); + vdev_t *vd = msp->ms_group->mg_vd; + + if (spa_writeable(spa)) { + msp->ms_condense_wanted = B_TRUE; + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + spa_dbgmsg(spa, "txg %llu, requesting force condense: " + "msp %p, vd %p", txg, msp, vd); + } + msp->ms_fragmentation = ZFS_FRAG_INVALID; + return; + } + + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + uint64_t space = 0; + uint8_t shift = msp->ms_sm->sm_shift; + + int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, + FRAGMENTATION_TABLE_SIZE - 1); + + if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) + continue; + + space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); + total += space; + ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); + fragmentation += space * zfs_frag_table[idx]; + } + + if (total > 0) + fragmentation /= total; + ASSERT3U(fragmentation, <=, 100); + + msp->ms_fragmentation = fragmentation; +} + +/* + * Compute a weight -- a selection preference value -- for the given metaslab. + * This is based on the amount of free space, the level of fragmentation, + * the LBA range, and whether the metaslab is loaded. + */ static uint64_t -metaslab_weight(metaslab_t *msp) +metaslab_space_weight(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; - space_map_obj_t *smo = &msp->ms_smo; vdev_t *vd = mg->mg_vd; uint64_t weight, space; ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!vd->vdev_removing); /* * The baseline weight is the metaslab's free space. 
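Returning to zfs_frag_table[] and the metaslab_set_fragmentation() loop above, a worked example with illustrative numbers: with sm_shift equal to SPA_MINBLOCKSHIFT (512-byte units), histogram bucket i describes segments of roughly 2^(i + 9) bytes and indexes zfs_frag_table[i] directly, so 8K segments land on table entry 90 and 1M segments on entry 20. A metaslab whose free space is split evenly between 8K and 1M segments therefore scores (90 + 20) / 2 = 55, one whose free space sits entirely in 16M-or-larger segments scores 0, and a completely free metaslab with no space map at all is simply assigned 0.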
*/ - space = sm->sm_size - smo->smo_alloc; + space = msp->ms_size - space_map_allocated(msp->ms_sm); + + if (metaslab_fragmentation_factor_enabled && + msp->ms_fragmentation != ZFS_FRAG_INVALID) { + /* + * Use the fragmentation information to inversely scale + * down the baseline weight. We need to ensure that we + * don't exclude this metaslab completely when it's 100% + * fragmented. To avoid this we reduce the fragmented value + * by 1. + */ + space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; + + /* + * If space < SPA_MINBLOCKSIZE, then we will not allocate from + * this metaslab again. The fragmentation metric may have + * decreased the space to something smaller than + * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE + * so that we can consume any remaining space. + */ + if (space > 0 && space < SPA_MINBLOCKSIZE) + space = SPA_MINBLOCKSIZE; + } weight = space; /* @@ -788,219 +1739,669 @@ metaslab_weight(metaslab_t *msp) * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ - weight = 2 * weight - - ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; - ASSERT(weight >= space && weight <= 2 * space); + if (metaslab_lba_weighting_enabled) { + weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; + ASSERT(weight >= space && weight <= 2 * space); + } /* - * For locality, assign higher weight to metaslabs which have - * a lower offset than what we've already activated. + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. If the fragmentation on this metaslab + * has exceed our threshold, then don't mark it active. */ - if (sm->sm_start <= mg->mg_bonus_area) - weight *= (metaslab_smo_bonus_pct / 100); - ASSERT(weight >= space && - weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); + if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && + msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + } + + WEIGHT_SET_SPACEBASED(weight); + return (weight); +} + +/* + * Return the weight of the specified metaslab, according to the segment-based + * weighting algorithm. The metaslab must be loaded. This function can + * be called within a sync pass since it relies only on the metaslab's + * range tree which is always accurate when the metaslab is loaded. + */ +static uint64_t +metaslab_weight_from_range_tree(metaslab_t *msp) +{ + uint64_t weight = 0; + uint32_t segments = 0; + + ASSERT(msp->ms_loaded); + + for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; + i--) { + uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; + int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; + + segments <<= 1; + segments += msp->ms_tree->rt_histogram[i]; - if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { /* - * If this metaslab is one we're actively using, adjust its - * weight to make it preferable to any inactive metaslab so - * we'll polish it off. + * The range tree provides more precision than the space map + * and must be downgraded so that all values fit within the + * space map's histogram. This allows us to compare loaded + * vs. unloaded metaslabs to determine which metaslab is + * considered "best". 
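The WEIGHT_SET_COUNT/INDEX/ACTIVE macros used above pack the highest occupied histogram bucket into a single 64-bit value so that loaded and unloaded metaslabs can be compared directly. The exact bit layout lives in metaslab_impl.h, which is not part of this hunk; the sketch below uses an invented but similar layout (activation bits at the top, a 6-bit index, the rest a segment count) purely to show why the bucket index dominates the comparison.

#include <stdint.h>
#include <stdio.h>

#define EX_ACTIVE_SHIFT 62
#define EX_INDEX_SHIFT  55
#define EX_INDEX_MASK   0x3fULL
#define EX_COUNT_MASK   ((1ULL << 55) - 1)

/* Pack (active, index, count) the way a segment-based weight might be. */
static uint64_t
ex_segment_weight(unsigned int index, uint64_t count, unsigned int active)
{
        return (((uint64_t)active << EX_ACTIVE_SHIFT) |
            (((uint64_t)index & EX_INDEX_MASK) << EX_INDEX_SHIFT) |
            (count & EX_COUNT_MASK));
}

int
main(void)
{
        /*
         * A metaslab whose largest free segments sit in the 2^17 (128K)
         * bucket outweighs one whose largest bucket is 2^14 (16K), no
         * matter how many 16K segments the latter has, because the index
         * occupies more-significant bits than the count.
         */
        uint64_t a = ex_segment_weight(17, 12, 0);
        uint64_t b = ex_segment_weight(14, 100000, 0);

        printf("a beats b: %s\n", a > b ? "yes" : "no");
        return (0);
}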
*/ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + if (i > max_idx) + continue; + + if (segments != 0) { + WEIGHT_SET_COUNT(weight, segments); + WEIGHT_SET_INDEX(weight, i); + WEIGHT_SET_ACTIVE(weight, 0); + break; + } } return (weight); } -static void -metaslab_prefetch(metaslab_group_t *mg) +/* + * Calculate the weight based on the on-disk histogram. This should only + * be called after a sync pass has completely finished since the on-disk + * information is updated in metaslab_sync(). + */ +static uint64_t +metaslab_weight_from_spacemap(metaslab_t *msp) { - spa_t *spa = mg->mg_vd->vdev_spa; - metaslab_t *msp; - avl_tree_t *t = &mg->mg_metaslab_tree; - int m; + uint64_t weight = 0; - mutex_enter(&mg->mg_lock); + for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { + if (msp->ms_sm->sm_phys->smp_histogram[i] != 0) { + WEIGHT_SET_COUNT(weight, + msp->ms_sm->sm_phys->smp_histogram[i]); + WEIGHT_SET_INDEX(weight, i + + msp->ms_sm->sm_shift); + WEIGHT_SET_ACTIVE(weight, 0); + break; + } + } + return (weight); +} + +/* + * Compute a segment-based weight for the specified metaslab. The weight + * is determined by highest bucket in the histogram. The information + * for the highest bucket is encoded into the weight value. + */ +static uint64_t +metaslab_segment_weight(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + uint64_t weight = 0; + uint8_t shift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); /* - * Prefetch the next potential metaslabs + * The metaslab is completely free. */ - for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { - space_map_t *sm = &msp->ms_map; - space_map_obj_t *smo = &msp->ms_smo; + if (space_map_allocated(msp->ms_sm) == 0) { + int idx = highbit64(msp->ms_size) - 1; + int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; + + if (idx < max_idx) { + WEIGHT_SET_COUNT(weight, 1ULL); + WEIGHT_SET_INDEX(weight, idx); + } else { + WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); + WEIGHT_SET_INDEX(weight, max_idx); + } + WEIGHT_SET_ACTIVE(weight, 0); + ASSERT(!WEIGHT_IS_SPACEBASED(weight)); - /* If we have reached our prefetch limit then we're done */ - if (m >= metaslab_prefetch_limit) - break; + return (weight); + } - if (!sm->sm_loaded && smo->smo_object != 0) { - mutex_exit(&mg->mg_lock); - dmu_prefetch(spa_meta_objset(spa), smo->smo_object, - 0ULL, smo->smo_objsize); - mutex_enter(&mg->mg_lock); - } + ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); + + /* + * If the metaslab is fully allocated then just make the weight 0. + */ + if (space_map_allocated(msp->ms_sm) == msp->ms_size) + return (0); + /* + * If the metaslab is already loaded, then use the range tree to + * determine the weight. Otherwise, we rely on the space map information + * to generate the weight. + */ + if (msp->ms_loaded) { + weight = metaslab_weight_from_range_tree(msp); + } else { + weight = metaslab_weight_from_spacemap(msp); } - mutex_exit(&mg->mg_lock); + + /* + * If the metaslab was active the last time we calculated its weight + * then keep it active. We want to consume the entire region that + * is associated with this weight. + */ + if (msp->ms_activation_weight != 0 && weight != 0) + WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); + return (weight); } -static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) +/* + * Determine if we should attempt to allocate from this metaslab. 
If the + * metaslab has a maximum size then we can quickly determine if the desired + * allocation size can be satisfied. Otherwise, if we're using segment-based + * weighting then we can determine the maximum allocation that this metaslab + * can accommodate based on the index encoded in the weight. If we're using + * space-based weights then rely on the entire weight (excluding the weight + * type bit). + */ +boolean_t +metaslab_should_allocate(metaslab_t *msp, uint64_t asize) { - metaslab_group_t *mg = msp->ms_group; - space_map_t *sm = &msp->ms_map; - space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; + boolean_t should_allocate; + + if (msp->ms_max_size != 0) + return (msp->ms_max_size >= asize); + + if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + /* + * The metaslab segment weight indicates segments in the + * range [2^i, 2^(i+1)), where i is the index in the weight. + * Since the asize might be in the middle of the range, we + * should attempt the allocation if asize < 2^(i+1). + */ + should_allocate = (asize < + 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); + } else { + should_allocate = (asize <= + (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); + } + return (should_allocate); +} + +static uint64_t +metaslab_weight(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + uint64_t weight; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * This vdev is in the process of being removed so there is nothing + * for us to do here. + */ + if (vd->vdev_removing) { + ASSERT0(space_map_allocated(msp->ms_sm)); + ASSERT0(vd->vdev_ms_shift); + return (0); + } + + metaslab_set_fragmentation(msp); + + /* + * Update the maximum size if the metaslab is loaded. This will + * ensure that we get an accurate maximum size if newly freed space + * has been added back into the free tree. + */ + if (msp->ms_loaded) + msp->ms_max_size = metaslab_block_maxsize(msp); + + /* + * Segment-based weighting requires space map histogram support. + */ + if (zfs_metaslab_segment_weight_enabled && + spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == + sizeof (space_map_phys_t))) { + weight = metaslab_segment_weight(msp); + } else { + weight = metaslab_space_weight(msp); + } + return (weight); +} +static int +metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +{ ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - space_map_load_wait(sm); - if (!sm->sm_loaded) { - int error = space_map_load(sm, sm_ops, SM_FREE, - &msp->ms_smo, - spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); - if (error) { + metaslab_load_wait(msp); + if (!msp->ms_loaded) { + int error = metaslab_load(msp); + if (error) { metaslab_group_sort(msp->ms_group, msp, 0); return (error); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_walk(&msp->ms_defermap[t], - space_map_claim, sm); - } - /* - * Track the bonus area as we activate new metaslabs. - */ - if (sm->sm_start > mg->mg_bonus_area) { - mutex_enter(&mg->mg_lock); - mg->mg_bonus_area = sm->sm_start; - mutex_exit(&mg->mg_lock); - } - - /* - * If we were able to load the map then make sure - * that this map is still able to satisfy our request. 
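As a quick illustration of the segment-based test in metaslab_should_allocate() above: if the highest occupied bucket has index 17 (segments of 128K up to, but not including, 256K), any request smaller than 256K is attempted, since the exact segment sizes inside the bucket are not known. The index and sizes below are made-up example values.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        unsigned int index = 17;        /* highest occupied histogram bucket */
        uint64_t asize[] = { 131072, 200704, 262144 };

        for (int i = 0; i < 3; i++) {
                int ok = asize[i] < (1ULL << (index + 1));

                printf("asize %6llu: %s\n", (unsigned long long)asize[i],
                    ok ? "attempt allocation" : "skip this metaslab");
        }
        return (0);
}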
- */ - if (msp->ms_weight < size) - return (ENOSPC); - + msp->ms_activation_weight = msp->ms_weight; metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } - ASSERT(sm->sm_loaded); + ASSERT(msp->ms_loaded); ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); return (0); } static void -metaslab_passivate(metaslab_t *msp, uint64_t size) +metaslab_passivate(metaslab_t *msp, uint64_t weight) { + uint64_t size = weight & ~METASLAB_WEIGHT_TYPE; + /* * If size < SPA_MINBLOCKSIZE, then we will not allocate from * this metaslab again. In that case, it had better be empty, * or we would be leaving space on the table. */ - ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0); - metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); + ASSERT(size >= SPA_MINBLOCKSIZE || + range_tree_space(msp->ms_tree) == 0); + ASSERT0(weight & METASLAB_ACTIVE_MASK); + + msp->ms_activation_weight = 0; + metaslab_group_sort(msp->ms_group, msp, weight); ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); } /* + * Segment-based metaslabs are activated once and remain active until + * we either fail an allocation attempt (similar to space-based metaslabs) + * or have exhausted the free space in zfs_metaslab_switch_threshold + * buckets since the metaslab was activated. This function checks to see + * if we've exhaused the zfs_metaslab_switch_threshold buckets in the + * metaslab and passivates it proactively. This will allow us to select a + * metaslabs with larger contiguous region if any remaining within this + * metaslab group. If we're in sync pass > 1, then we continue using this + * metaslab so that we don't dirty more block and cause more sync passes. + */ +void +metaslab_segment_may_passivate(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) + return; + + /* + * Since we are in the middle of a sync pass, the most accurate + * information that is accessible to us is the in-core range tree + * histogram; calculate the new weight based on that information. + */ + uint64_t weight = metaslab_weight_from_range_tree(msp); + int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); + int current_idx = WEIGHT_GET_INDEX(weight); + + if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) + metaslab_passivate(msp, weight); +} + +static void +metaslab_preload(void *arg) +{ + metaslab_t *msp = arg; + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); + + mutex_enter(&msp->ms_lock); + metaslab_load_wait(msp); + if (!msp->ms_loaded) + (void) metaslab_load(msp); + msp->ms_selected_txg = spa_syncing_txg(spa); + mutex_exit(&msp->ms_lock); +} + +static void +metaslab_group_preload(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_t *msp; + avl_tree_t *t = &mg->mg_metaslab_tree; + int m = 0; + + if (spa_shutting_down(spa) || !metaslab_preload_enabled) { + taskq_wait(mg->mg_taskq); + return; + } + + mutex_enter(&mg->mg_lock); + /* + * Load the next potential metaslabs + */ + for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) { + /* + * We preload only the maximum number of metaslabs specified + * by metaslab_preload_limit. If a metaslab is being forced + * to condense then we preload it too. This will ensure + * that force condensing happens in the next txg. 
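The proactive passivation described above boils down to a comparison of histogram indices. A small sketch with made-up numbers follows; the real tunable is zfs_metaslab_switch_threshold, and the value 2 here is only an assumption for the example.

#include <stdio.h>

int
main(void)
{
        int activation_idx = 20;        /* highest bucket at activation (~1M segments) */
        int threshold = 2;              /* stand-in for zfs_metaslab_switch_threshold */
        int current_idx[] = { 20, 19, 18, 17 };

        for (int i = 0; i < 4; i++) {
                int passivate = current_idx[i] <= activation_idx - threshold;

                printf("highest bucket now %d: %s\n", current_idx[i],
                    passivate ? "passivate" : "keep using");
        }
        return (0);
}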
+ */ + if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { + continue; + } + + VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, + msp, TQ_SLEEP) != 0); + } + mutex_exit(&mg->mg_lock); +} + +/* + * Determine if the space map's on-disk footprint is past our tolerance + * for inefficiency. We would like to use the following criteria to make + * our decision: + * + * 1. The size of the space map object should not dramatically increase as a + * result of writing out the free space range tree. + * + * 2. The minimal on-disk space map representation is zfs_condense_pct/100 + * times the size than the free space range tree representation + * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). + * + * 3. The on-disk size of the space map should actually decrease. + * + * Checking the first condition is tricky since we don't want to walk + * the entire AVL tree calculating the estimated on-disk size. Instead we + * use the size-ordered range tree in the metaslab and calculate the + * size required to write out the largest segment in our free tree. If the + * size required to represent that segment on disk is larger than the space + * map object then we avoid condensing this map. + * + * To determine the second criterion we use a best-case estimate and assume + * each segment can be represented on-disk as a single 64-bit entry. We refer + * to this best-case estimate as the space map's minimal form. + * + * Unfortunately, we cannot compute the on-disk size of the space map in this + * context because we cannot accurately compute the effects of compression, etc. + * Instead, we apply the heuristic described in the block comment for + * zfs_metaslab_condense_block_threshold - we only condense if the space used + * is greater than a threshold number of blocks. + */ +static boolean_t +metaslab_should_condense(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_sm; + range_seg_t *rs; + uint64_t size, entries, segsz, object_size, optimal_size, record_size; + dmu_object_info_t doi; + uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(msp->ms_loaded); + + /* + * Use the ms_size_tree range tree, which is ordered by size, to + * obtain the largest segment in the free tree. We always condense + * metaslabs that are empty and metaslabs for which a condense + * request has been made. + */ + rs = avl_last(&msp->ms_size_tree); + if (rs == NULL || msp->ms_condense_wanted) + return (B_TRUE); + + /* + * Calculate the number of 64-bit entries this segment would + * require when written to disk. If this single segment would be + * larger on-disk than the entire current on-disk structure, then + * clearly condensing will increase the on-disk structure size. + */ + size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + entries = size / (MIN(size, SM_RUN_MAX)); + segsz = entries * sizeof (uint64_t); + + optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + object_size = space_map_length(msp->ms_sm); + + dmu_object_info_from_db(sm->sm_dbuf, &doi); + record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + + return (segsz <= object_size && + object_size >= (optimal_size * zfs_condense_pct / 100) && + object_size > zfs_metaslab_condense_block_threshold * record_size); +} + +/* + * Condense the on-disk space map representation to its minimized form. + * The minimized form consists of a small number of allocations followed by + * the entries of the free range tree. 
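Plugging hypothetical numbers into the three tests of metaslab_should_condense() may help. The tunable values (110% and a 4-block threshold) and all sizes below are assumptions made for the example, not statements about the defaults shipped with this change.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t segsz = 8;                /* largest free segment fits one 8-byte entry */
        uint64_t object_size = 1310720;    /* current on-disk space map, ~1.25M */
        uint64_t in_core_segments = 5000;  /* nodes in the free range tree */
        uint64_t optimal_size = in_core_segments * 8;   /* minimal form: one entry each */
        uint64_t record_size = 131072;     /* space map block size */
        uint64_t condense_pct = 110, block_threshold = 4;

        int should =
            segsz <= object_size &&
            object_size >= optimal_size * condense_pct / 100 &&
            object_size > block_threshold * record_size;

        printf("condense: %s\n", should ? "yes" : "no");
        return (0);
}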
+ */ +static void +metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; + range_tree_t *condense_tree; + space_map_t *sm = msp->ms_sm; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT3U(spa_sync_pass(spa), ==, 1); + ASSERT(msp->ms_loaded); + + + spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, " + "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, + msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, + msp->ms_group->mg_vd->vdev_spa->spa_name, + space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), + msp->ms_condense_wanted ? "TRUE" : "FALSE"); + + msp->ms_condense_wanted = B_FALSE; + + /* + * Create an range tree that is 100% allocated. We remove segments + * that have been freed in this txg, any deferred frees that exist, + * and any allocation in the future. Removing segments should be + * a relatively inexpensive operation since we expect these trees to + * have a small number of nodes. + */ + condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); + range_tree_add(condense_tree, msp->ms_start, msp->ms_size); + + /* + * Remove what's been freed in this txg from the condense_tree. + * Since we're in sync_pass 1, we know that all the frees from + * this txg are in the freetree. + */ + range_tree_walk(freetree, range_tree_remove, condense_tree); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defertree[t], + range_tree_remove, condense_tree); + } + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], + range_tree_remove, condense_tree); + } + + /* + * We're about to drop the metaslab's lock thus allowing + * other consumers to change it's content. Set the + * metaslab's ms_condensing flag to ensure that + * allocations on this metaslab do not occur while we're + * in the middle of committing it to disk. This is only critical + * for the ms_tree as all other range trees use per txg + * views of their content. + */ + msp->ms_condensing = B_TRUE; + + mutex_exit(&msp->ms_lock); + space_map_truncate(sm, tx); + mutex_enter(&msp->ms_lock); + + /* + * While we would ideally like to create a space map representation + * that consists only of allocation records, doing so can be + * prohibitively expensive because the in-core free tree can be + * large, and therefore computationally expensive to subtract + * from the condense_tree. Instead we sync out two trees, a cheap + * allocation only tree followed by the in-core free tree. While not + * optimal, this is typically close to optimal, and much cheaper to + * compute. + */ + space_map_write(sm, condense_tree, SM_ALLOC, tx); + range_tree_vacate(condense_tree, NULL, NULL); + range_tree_destroy(condense_tree); + + space_map_write(sm, msp->ms_tree, SM_FREE, tx); + msp->ms_condensing = B_FALSE; +} + +/* * Write a metaslab to disk in the context of the specified transaction group. 
*/ void metaslab_sync(metaslab_t *msp, uint64_t txg) { - vdev_t *vd = msp->ms_group->mg_vd; + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; spa_t *spa = vd->vdev_spa; objset_t *mos = spa_meta_objset(spa); - space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; - space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *sm = &msp->ms_map; - space_map_obj_t *smo = &msp->ms_smo_syncing; - dmu_buf_t *db; + range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; + range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; + range_tree_t **freed_tree = + &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; dmu_tx_t *tx; + uint64_t object = space_map_object(msp->ms_sm); ASSERT(!vd->vdev_ishole); - if (allocmap->sm_space == 0 && freemap->sm_space == 0) + /* + * This metaslab has just been added so there's no work to do now. + */ + if (*freetree == NULL) { + ASSERT3P(alloctree, ==, NULL); + return; + } + + ASSERT3P(alloctree, !=, NULL); + ASSERT3P(*freetree, !=, NULL); + ASSERT3P(*freed_tree, !=, NULL); + + /* + * Normally, we don't want to process a metaslab if there + * are no allocations or frees to perform. However, if the metaslab + * is being forced to condense we need to let it through. + */ + if (range_tree_space(alloctree) == 0 && + range_tree_space(*freetree) == 0 && + !msp->ms_condense_wanted) return; /* * The only state that can actually be changing concurrently with - * metaslab_sync() is the metaslab's ms_map. No other thread can - * be modifying this txg's allocmap, freemap, freed_map, or smo. - * Therefore, we only hold ms_lock to satify space_map ASSERTs. - * We drop it whenever we call into the DMU, because the DMU - * can call down to us (e.g. via zio_free()) at any time. + * metaslab_sync() is the metaslab's ms_tree. No other thread can + * be modifying this txg's alloctree, freetree, freed_tree, or + * space_map_phys_t. Therefore, we only hold ms_lock to satify + * space map ASSERTs. We drop it whenever we call into the DMU, + * because the DMU can call down to us (e.g. via zio_free()) at + * any time. */ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - if (smo->smo_object == 0) { - ASSERT(smo->smo_objsize == 0); - ASSERT(smo->smo_alloc == 0); - smo->smo_object = dmu_object_alloc(mos, - DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, - DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); - ASSERT(smo->smo_object != 0); - dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * - (sm->sm_start >> vd->vdev_ms_shift), - sizeof (uint64_t), &smo->smo_object, tx); + if (msp->ms_sm == NULL) { + uint64_t new_object; + + new_object = space_map_alloc(mos, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, + msp->ms_start, msp->ms_size, vd->vdev_ashift, + &msp->ms_lock)); + ASSERT(msp->ms_sm != NULL); } mutex_enter(&msp->ms_lock); - space_map_walk(freemap, space_map_add, freed_map); + /* + * Note: metaslab_condense() clears the space map's histogram. + * Therefore we must verify and remove this histogram before + * condensing. 
+ */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + if (msp->ms_loaded && spa_sync_pass(spa) == 1 && + metaslab_should_condense(msp)) { + metaslab_condense(msp, txg, tx); + } else { + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); + space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); + } + + if (msp->ms_loaded) { + /* + * When the space map is loaded, we have an accruate + * histogram in the range tree. This gives us an opportunity + * to bring the space map's histogram up-to-date so we clear + * it first before updating it. + */ + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); - if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= - 2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) { /* - * The in-core space map representation is twice as compact - * as the on-disk one, so it's time to condense the latter - * by generating a pure allocmap from first principles. - * - * This metaslab is 100% allocated, - * minus the content of the in-core map (sm), - * minus what's been freed this txg (freed_map), - * minus deferred frees (ms_defermap[]), - * minus allocations from txgs in the future - * (because they haven't been committed yet). - */ - space_map_vacate(allocmap, NULL, NULL); - space_map_vacate(freemap, NULL, NULL); - - space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size); - - space_map_walk(sm, space_map_remove, allocmap); - space_map_walk(freed_map, space_map_remove, allocmap); - - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_walk(&msp->ms_defermap[t], - space_map_remove, allocmap); - - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) - space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], - space_map_remove, allocmap); + * Since we've cleared the histogram we need to add back + * any free space that has already been processed, plus + * any deferred space. This allows the on-disk histogram + * to accurately reflect all free space even if some space + * is not yet available for allocation (i.e. deferred). + */ + space_map_histogram_add(msp->ms_sm, *freed_tree, tx); - mutex_exit(&msp->ms_lock); - space_map_truncate(smo, mos, tx); - mutex_enter(&msp->ms_lock); + /* + * Add back any deferred free space that has not been + * added back into the in-core free tree yet. This will + * ensure that we don't end up with a space map histogram + * that is completely empty unless the metaslab is fully + * allocated. + */ + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defertree[t], tx); + } + } + + /* + * Always add the free space from this sync pass to the space + * map histogram. We want to make sure that the on-disk histogram + * accounts for all free space. If the space map is not loaded, + * then we will lose some accuracy but will correct it the next + * time we load the space map. + */ + space_map_histogram_add(msp->ms_sm, *freetree, tx); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + /* + * For sync pass 1, we avoid traversing this txg's free range tree + * and instead will just swap the pointers for freetree and + * freed_tree. We can safely do this since the freed_tree is + * guaranteed to be empty on the initial pass. 
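The swap-versus-vacate choice above can be pictured with a toy model. range_set_t and its single field are invented stand-ins for the real range_tree_t, which tracks segments rather than a byte count; only the pointer handling is being illustrated.

typedef struct range_set {
        unsigned long long rs_space;    /* bytes tracked; nothing else modeled */
} range_set_t;

void
sync_frees(range_set_t **freetree, range_set_t **freed_tree, int sync_pass)
{
        if (sync_pass == 1) {
                /* freed_tree is guaranteed empty here, so swapping is enough. */
                range_set_t *tmp = *freetree;

                *freetree = *freed_tree;
                *freed_tree = tmp;
        } else {
                /* Later passes: move the contents instead of the pointers. */
                (*freed_tree)->rs_space += (*freetree)->rs_space;
                (*freetree)->rs_space = 0;
        }
}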
+ */ + if (spa_sync_pass(spa) == 1) { + range_tree_swap(freetree, freed_tree); + } else { + range_tree_vacate(*freetree, range_tree_add, *freed_tree); } + range_tree_vacate(alloctree, NULL, NULL); - space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); - space_map_sync(freemap, SM_FREE, smo, mos, tx); + ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_alloctree[TXG_CLEAN(txg) & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); mutex_exit(&msp->ms_lock); - VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, >=, sizeof (*smo)); - bcopy(smo, db->db_data, sizeof (*smo)); - dmu_buf_rele(db, FTAG); - + if (object != space_map_object(msp->ms_sm)) { + object = space_map_object(msp->ms_sm); + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &object, tx); + } dmu_tx_commit(tx); } @@ -1011,14 +2412,13 @@ metaslab_sync(metaslab_t *msp, uint64_t void metaslab_sync_done(metaslab_t *msp, uint64_t txg) { - space_map_obj_t *smo = &msp->ms_smo; - space_map_obj_t *smosync = &msp->ms_smo_syncing; - space_map_t *sm = &msp->ms_map; - space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; - space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + range_tree_t **freed_tree; + range_tree_t **defer_tree; int64_t alloc_delta, defer_delta; + boolean_t defer_allowed = B_TRUE; ASSERT(!vd->vdev_ishole); @@ -1026,46 +2426,79 @@ metaslab_sync_done(metaslab_t *msp, uint /* * If this metaslab is just becoming available, initialize its - * allocmaps and freemaps and add its capacity to the vdev. + * alloctrees, freetrees, and defertree and add its capacity to + * the vdev. 
*/ - if (freed_map->sm_size == 0) { + if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { for (int t = 0; t < TXG_SIZE; t++) { - space_map_create(&msp->ms_allocmap[t], sm->sm_start, - sm->sm_size, sm->sm_shift, sm->sm_lock); - space_map_create(&msp->ms_freemap[t], sm->sm_start, - sm->sm_size, sm->sm_shift, sm->sm_lock); + ASSERT(msp->ms_alloctree[t] == NULL); + ASSERT(msp->ms_freetree[t] == NULL); + + msp->ms_alloctree[t] = range_tree_create(NULL, msp, + &msp->ms_lock); + msp->ms_freetree[t] = range_tree_create(NULL, msp, + &msp->ms_lock); + } + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + ASSERT(msp->ms_defertree[t] == NULL); + + msp->ms_defertree[t] = range_tree_create(NULL, msp, + &msp->ms_lock); } - for (int t = 0; t < TXG_DEFER_SIZE; t++) - space_map_create(&msp->ms_defermap[t], sm->sm_start, - sm->sm_size, sm->sm_shift, sm->sm_lock); + vdev_space_update(vd, 0, 0, msp->ms_size); + } + + freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; + defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; - vdev_space_update(vd, 0, 0, sm->sm_size); + uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - + metaslab_class_get_alloc(spa_normal_class(spa)); + if (free_space <= spa_get_slop_space(spa)) { + defer_allowed = B_FALSE; } - alloc_delta = smosync->smo_alloc - smo->smo_alloc; - defer_delta = freed_map->sm_space - defer_map->sm_space; + defer_delta = 0; + alloc_delta = space_map_alloc_delta(msp->ms_sm); + if (defer_allowed) { + defer_delta = range_tree_space(*freed_tree) - + range_tree_space(*defer_tree); + } else { + defer_delta -= range_tree_space(*defer_tree); + } vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); - ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); - ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); + ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); /* - * If there's a space_map_load() in progress, wait for it to complete + * If there's a metaslab_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. - * Then, add defer_map (oldest deferred frees) to this map and - * transfer freed_map (this txg's frees) to defer_map. */ - space_map_load_wait(sm); - space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); - space_map_vacate(freed_map, space_map_add, defer_map); + metaslab_load_wait(msp); + + /* + * Move the frees from the defer_tree back to the free + * range tree (if it's loaded). Swap the freed_tree and the + * defer_tree -- this is safe to do because we've just emptied out + * the defer_tree. + */ + range_tree_vacate(*defer_tree, + msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + if (defer_allowed) { + range_tree_swap(freed_tree, defer_tree); + } else { + range_tree_vacate(*freed_tree, + msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); + } - *smo = *smosync; + space_map_update(msp->ms_sm); msp->ms_deferspace += defer_delta; ASSERT3S(msp->ms_deferspace, >=, 0); - ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); + ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); if (msp->ms_deferspace != 0) { /* * Keep syncing this metaslab until all deferred frees @@ -1075,22 +2508,25 @@ metaslab_sync_done(metaslab_t *msp, uint } /* - * If the map is loaded but no longer active, evict it as soon as all - * future allocations have synced. (If we unloaded it now and then - * loaded a moment later, the map wouldn't reflect those allocations.) 
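A mechanical plug-in of hypothetical numbers for the defer_delta logic above: 8M freed in this txg and 3M sitting in the oldest defer tree. When the pool is close to its slop limit (defer_allowed is false), the new frees bypass the defer machinery entirely, so the deferred total can only shrink.

#include <stdio.h>

int
main(void)
{
        long long freed = 8, oldest_defer = 3;  /* MB, hypothetical */

        for (int defer_allowed = 1; defer_allowed >= 0; defer_allowed--) {
                long long defer_delta = 0;

                if (defer_allowed)
                        defer_delta = freed - oldest_defer;
                else
                        defer_delta -= oldest_defer;

                printf("defer_allowed=%d -> defer_delta=%lldM\n",
                    defer_allowed, defer_delta);
        }
        return (0);
}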
- */ - if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int evictable = 1; - - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) - if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) - evictable = 0; + * Calculate the new weights before unloading any metaslabs. + * This will give us the most accurate weighting. + */ + metaslab_group_sort(mg, msp, metaslab_weight(msp)); - if (evictable && !metaslab_debug) - space_map_unload(sm); - } + /* + * If the metaslab is loaded and we've not tried to load or allocate + * from it in 'metaslab_unload_delay' txgs, then unload it. + */ + if (msp->ms_loaded && + msp->ms_selected_txg + metaslab_unload_delay < txg) { + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_alloctree[(txg + t) & TXG_MASK])); + } - metaslab_group_sort(mg, msp, metaslab_weight(msp)); + if (!metaslab_debug_unload) + metaslab_unload(msp); + } mutex_exit(&msp->ms_lock); } @@ -1098,27 +2534,13 @@ metaslab_sync_done(metaslab_t *msp, uint void metaslab_sync_reassess(metaslab_group_t *mg) { - vdev_t *vd = mg->mg_vd; - - /* - * Re-evaluate all metaslabs which have lower offsets than the - * bonus area. - */ - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - - if (msp->ms_map.sm_start > mg->mg_bonus_area) - break; - - mutex_enter(&msp->ms_lock); - metaslab_group_sort(mg, msp, metaslab_weight(msp)); - mutex_exit(&msp->ms_lock); - } + metaslab_group_alloc_update(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* - * Prefetch the next potential metaslabs + * Preload the next potential metaslabs */ - metaslab_prefetch(mg); + metaslab_group_preload(mg); } static uint64_t @@ -1126,7 +2548,7 @@ metaslab_distance(metaslab_t *msp, dva_t { uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; - uint64_t start = msp->ms_map.sm_start >> ms_shift; + uint64_t start = msp->ms_id; if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) return (1ULL << 63); @@ -1138,13 +2560,205 @@ metaslab_distance(metaslab_t *msp, dva_t return (0); } +/* + * ========================================================================== + * Metaslab allocation tracing facility + * ========================================================================== + */ +kstat_t *metaslab_trace_ksp; +kstat_named_t metaslab_trace_over_limit; + +void +metaslab_alloc_trace_init(void) +{ + ASSERT(metaslab_alloc_trace_cache == NULL); + metaslab_alloc_trace_cache = kmem_cache_create( + "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + metaslab_trace_ksp = kstat_create("zfs", 0, "metaslab_trace_stats", + "misc", KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL); + if (metaslab_trace_ksp != NULL) { + metaslab_trace_ksp->ks_data = &metaslab_trace_over_limit; + kstat_named_init(&metaslab_trace_over_limit, + "metaslab_trace_over_limit", KSTAT_DATA_UINT64); + kstat_install(metaslab_trace_ksp); + } +} + +void +metaslab_alloc_trace_fini(void) +{ + if (metaslab_trace_ksp != NULL) { + kstat_delete(metaslab_trace_ksp); + metaslab_trace_ksp = NULL; + } + kmem_cache_destroy(metaslab_alloc_trace_cache); + metaslab_alloc_trace_cache = NULL; +} + +/* + * Add an allocation trace element to the allocation tracing list. 
+ */ +static void +metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, + metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset) +{ + if (!metaslab_trace_enabled) + return; + + /* + * When the tracing list reaches its maximum we remove + * the second element in the list before adding a new one. + * By removing the second element we preserve the original + * entry as a clue to what allocations steps have already been + * performed. + */ + if (zal->zal_size == metaslab_trace_max_entries) { + metaslab_alloc_trace_t *mat_next; +#ifdef DEBUG + panic("too many entries in allocation list"); +#endif + atomic_inc_64(&metaslab_trace_over_limit.value.ui64); + zal->zal_size--; + mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); + list_remove(&zal->zal_list, mat_next); + kmem_cache_free(metaslab_alloc_trace_cache, mat_next); + } + + metaslab_alloc_trace_t *mat = + kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); + list_link_init(&mat->mat_list_node); + mat->mat_mg = mg; + mat->mat_msp = msp; + mat->mat_size = psize; + mat->mat_dva_id = dva_id; + mat->mat_offset = offset; + mat->mat_weight = 0; + + if (msp != NULL) + mat->mat_weight = msp->ms_weight; + + /* + * The list is part of the zio so locking is not required. Only + * a single thread will perform allocations for a given zio. + */ + list_insert_tail(&zal->zal_list, mat); + zal->zal_size++; + + ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); +} + +void +metaslab_trace_init(zio_alloc_list_t *zal) +{ + list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), + offsetof(metaslab_alloc_trace_t, mat_list_node)); + zal->zal_size = 0; +} + +void +metaslab_trace_fini(zio_alloc_list_t *zal) +{ + metaslab_alloc_trace_t *mat; + + while ((mat = list_remove_head(&zal->zal_list)) != NULL) + kmem_cache_free(metaslab_alloc_trace_cache, mat); + list_destroy(&zal->zal_list); + zal->zal_size = 0; +} + +/* + * ========================================================================== + * Metaslab block operations + * ========================================================================== + */ + +static void +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_add(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + flags & METASLAB_DONT_THROTTLE) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); +} + +void +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) +{ +#ifdef ZFS_DEBUG + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + + for (int d = 0; d < ndvas; d++) { + uint64_t vdev = DVA_GET_VDEV(&dva[d]); + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); + } +#endif +} + +static uint64_t +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +{ + uint64_t start; + range_tree_t *rt = msp->ms_tree; + metaslab_class_t *mc = msp->ms_group->mg_class; + + VERIFY(!msp->ms_condensing); + + start = mc->mc_ops->msop_alloc(msp, size); + if (start != -1ULL) { + metaslab_group_t *mg = 
msp->ms_group; + vdev_t *vd = mg->mg_vd; + + VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); + range_tree_remove(rt, start, size); + + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], start, size); + + /* Track the last successful allocation */ + msp->ms_alloc_txg = txg; + metaslab_verify_space(msp, txg); + } + + /* + * Now that we've attempted the allocation we need to update the + * metaslab's maximum block size since it may have changed. + */ + msp->ms_max_size = metaslab_block_maxsize(msp); + return (start); +} + static uint64_t -metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, - uint64_t min_distance, dva_t *dva, int d) +metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) { metaslab_t *msp = NULL; uint64_t offset = -1ULL; - avl_tree_t *t = &mg->mg_metaslab_tree; uint64_t activation_weight; uint64_t target_distance; int i; @@ -1157,33 +2771,70 @@ metaslab_group_alloc(metaslab_group_t *m } } + metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); + search->ms_weight = UINT64_MAX; + search->ms_start = 0; for (;;) { boolean_t was_active; + avl_tree_t *t = &mg->mg_metaslab_tree; + avl_index_t idx; mutex_enter(&mg->mg_lock); - for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { - if (msp->ms_weight < size) { - mutex_exit(&mg->mg_lock); - return (-1ULL); + + /* + * Find the metaslab with the highest weight that is less + * than what we've already tried. In the common case, this + * means that we will examine each metaslab at most once. + * Note that concurrent callers could reorder metaslabs + * by activation/passivation once we have dropped the mg_lock. + * If a metaslab is activated by another thread, and we fail + * to allocate from the metaslab we have selected, we may + * not try the newly-activated metaslab, and instead activate + * another metaslab. This is not optimal, but generally + * does not cause any problems (a possible exception being + * if every metaslab is completely full except for the + * the newly-activated metaslab which we fail to examine). + */ + msp = avl_find(t, search, &idx); + if (msp == NULL) + msp = avl_nearest(t, idx, AVL_AFTER); + for (; msp != NULL; msp = AVL_NEXT(t, msp)) { + + if (!metaslab_should_allocate(msp, asize)) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL); + continue; } + /* + * If the selected metaslab is condensing, skip it. + */ + if (msp->ms_condensing) + continue; + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; target_distance = min_distance + - (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); + (space_map_allocated(msp->ms_sm) != 0 ? 0 : + min_distance >> 1); - for (i = 0; i < d; i++) + for (i = 0; i < d; i++) { if (metaslab_distance(msp, &dva[i]) < target_distance) break; + } if (i == d) break; } mutex_exit(&mg->mg_lock); - if (msp == NULL) + if (msp == NULL) { + kmem_free(search, sizeof (*search)); return (-1ULL); + } + search->ms_weight = msp->ms_weight; + search->ms_start = msp->ms_start + 1; mutex_enter(&msp->ms_lock); @@ -1191,11 +2842,11 @@ metaslab_group_alloc(metaslab_group_t *m * Ensure that the metaslab we have selected is still * capable of handling our request. 
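The loop above resumes its descending-weight scan with a throwaway search key so that each metaslab is examined at most once, even though mg_lock is dropped between attempts. The same idea is shown below on a plain sorted array (weight descending, start ascending) standing in for the AVL tree; the structure, helper, and data are invented for the illustration.

#include <stdint.h>
#include <stdio.h>

struct ms { uint64_t weight, start; };

/* First entry strictly past the key in (weight descending, start ascending) order. */
static int
find_after(const struct ms *v, int n, uint64_t key_weight, uint64_t key_start)
{
        for (int i = 0; i < n; i++) {
                if (v[i].weight < key_weight ||
                    (v[i].weight == key_weight && v[i].start >= key_start))
                        return (i);
        }
        return (n);
}

int
main(void)
{
        struct ms tree[] = {
                { 900, 0 }, { 700, 4096 }, { 700, 8192 }, { 300, 12288 }
        };
        uint64_t key_weight = UINT64_MAX, key_start = 0;

        for (;;) {
                int i = find_after(tree, 4, key_weight, key_start);

                if (i == 4)
                        break;
                printf("examine metaslab weight=%llu start=%llu\n",
                    (unsigned long long)tree[i].weight,
                    (unsigned long long)tree[i].start);

                /* Pretend the attempt failed; resume just past this entry. */
                key_weight = tree[i].weight;
                key_start = tree[i].start + 1;
        }
        return (0);
}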
It's possible that * another thread may have changed the weight while we - * were blocked on the metaslab lock. + * were blocked on the metaslab lock. We check the + * active status first to see if we need to reselect + * a new metaslab. */ - if (msp->ms_weight < size || (was_active && - !(msp->ms_weight & METASLAB_ACTIVE_MASK) && - activation_weight == METASLAB_WEIGHT_PRIMARY)) { + if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { mutex_exit(&msp->ms_lock); continue; } @@ -1208,53 +2859,157 @@ metaslab_group_alloc(metaslab_group_t *m continue; } - if (metaslab_activate(msp, activation_weight, size) != 0) { + if (metaslab_activate(msp, activation_weight) != 0) { + mutex_exit(&msp->ms_lock); + continue; + } + msp->ms_selected_txg = txg; + + /* + * Now that we have the lock, recheck to see if we should + * continue to use this metaslab for this allocation. The + * the metaslab is now loaded so metaslab_should_allocate() can + * accurately determine if the allocation attempt should + * proceed. + */ + if (!metaslab_should_allocate(msp, asize)) { + /* Passivate this metaslab and select a new one. */ + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL); + goto next; + } + + /* + * If this metaslab is currently condensing then pick again as + * we can't manipulate this metaslab until it's committed + * to disk. + */ + if (msp->ms_condensing) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_CONDENSING); mutex_exit(&msp->ms_lock); continue; } - if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) + offset = metaslab_block_alloc(msp, asize, txg); + metaslab_trace_add(zal, mg, msp, asize, d, offset); + + if (offset != -1ULL) { + /* Proactively passivate the metaslab, if needed */ + metaslab_segment_may_passivate(msp); break; + } +next: + ASSERT(msp->ms_loaded); - metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); + /* + * We were unable to allocate from this metaslab so determine + * a new weight for this metaslab. Now that we have loaded + * the metaslab we can provide a better hint to the metaslab + * selector. + * + * For space-based metaslabs, we use the maximum block size. + * This information is only available when the metaslab + * is loaded and is more accurate than the generic free + * space weight that was calculated by metaslab_weight(). + * This information allows us to quickly compare the maximum + * available allocation in the metaslab to the allocation + * size being requested. + * + * For segment-based metaslabs, determine the new weight + * based on the highest bucket in the range tree. We + * explicitly use the loaded segment weight (i.e. the range + * tree histogram) since it contains the space that is + * currently available for allocation and is accurate + * even within a sync pass. + */ + if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + uint64_t weight = metaslab_block_maxsize(msp); + WEIGHT_SET_SPACEBASED(weight); + metaslab_passivate(msp, weight); + } else { + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + } + /* + * We have just failed an allocation attempt, check + * that metaslab_should_allocate() agrees. Otherwise, + * we may end up in an infinite loop retrying the same + * metaslab. 
+ */ + ASSERT(!metaslab_should_allocate(msp, asize)); mutex_exit(&msp->ms_lock); } + mutex_exit(&msp->ms_lock); + kmem_free(search, sizeof (*search)); + return (offset); +} - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) - vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); - - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, uint64_t min_distance, dva_t *dva, int d) +{ + uint64_t offset; + ASSERT(mg->mg_initialized); - mutex_exit(&msp->ms_lock); + offset = metaslab_group_alloc_normal(mg, zal, asize, txg, + min_distance, dva, d); + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + metaslab_trace_add(zal, mg, NULL, asize, d, + TRACE_GROUP_FAILURE); + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be out of + * space. We must notify the allocation throttle + * to start skipping allocation attempts to this + * metaslab group until more space becomes available. + * Note: this failure cannot be caused by the + * allocation throttle since the allocation throttle + * is only responsible for skipping devices and + * not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); return (offset); } /* + * If we have to write a ditto block (i.e. more than one DVA for a given BP) + * on the same vdev as an existing DVA of this BP, then try to allocate it + * at least (vdev_asize / (2 ^ ditto_same_vdev_distance_shift)) away from the + * existing DVAs. + */ +int ditto_same_vdev_distance_shift = 3; + +/* * Allocate a block for the specified i/o. */ static int metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal) { metaslab_group_t *mg, *rotor; vdev_t *vd; - int dshift = 3; - int all_zero; - int zio_lock = B_FALSE; - boolean_t allocatable; - uint64_t offset = -1ULL; - uint64_t asize; - uint64_t distance; + boolean_t try_hard = B_FALSE; ASSERT(!DVA_IS_VALID(&dva[d])); /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) - return (ENOSPC); + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) { + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG); + return (SET_ERROR(ENOSPC)); + } /* * Start at the rotor and loop through all mgs until we find something. @@ -1311,47 +3066,78 @@ metaslab_alloc_dva(spa_t *spa, metaslab_ rotor = mg; top: - all_zero = B_TRUE; do { - ASSERT(mg->mg_activation_count == 1); + boolean_t allocatable; + ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; /* * Don't allocate from faulted devices. */ - if (zio_lock) { + if (try_hard) { spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); allocatable = vdev_allocatable(vd); spa_config_exit(spa, SCL_ZIO, FTAG); } else { allocatable = vdev_allocatable(vd); } - if (!allocatable) + + /* + * Determine if the selected metaslab group is eligible + * for allocations. If we're ganging then don't allow + * this metaslab group to skip allocations since that would + * inadvertently return ENOSPC and suspend the pool + * even though space is still available. 
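For the ditto-block distance rule introduced above, a quick numeric example with made-up vdev geometry: with the default shift of 3, a second DVA on the same 2 TB vdev must land at least 256 GB away from the existing one, and the requirement is dropped entirely when asize/8 would not even span one metaslab.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t vdev_asize = 2ULL << 40;       /* 2 TB vdev, hypothetical */
        uint64_t ms_shift = 34;                 /* 16 GB metaslabs, hypothetical */
        int shift = 3;                          /* ditto_same_vdev_distance_shift */

        uint64_t distance = vdev_asize >> shift;
        if (distance <= (1ULL << ms_shift))
                distance = 0;

        printf("required distance: %llu GB\n",
            (unsigned long long)(distance >> 30));
        return (0);
}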
+ */ + if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { + allocatable = metaslab_group_allocatable(mg, rotor, + psize); + } + + if (!allocatable) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_NOT_ALLOCATABLE); goto next; + } + + ASSERT(mg->mg_initialized); /* - * Avoid writing single-copy data to a failing vdev + * Avoid writing single-copy data to a failing, + * non-redundant vdev, unless we've already tried all + * other vdevs. */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && dshift == 3) { - all_zero = B_FALSE; + d == 0 && !try_hard && vd->vdev_children == 0) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_VDEV_ERROR); goto next; } ASSERT(mg->mg_class == mc); - distance = vd->vdev_asize >> dshift; - if (distance <= (1ULL << vd->vdev_ms_shift)) - distance = 0; - else - all_zero = B_FALSE; + /* + * If we don't need to try hard, then require that the + * block be 1/8th of the device away from any other DVAs + * in this BP. If we are trying hard, allow any offset + * to be used (distance=0). + */ + uint64_t distance = 0; + if (!try_hard) { + distance = vd->vdev_asize >> + ditto_same_vdev_distance_shift; + if (distance <= (1ULL << vd->vdev_ms_shift)) + distance = 0; + } - asize = vdev_psize_to_asize(vd, psize); + uint64_t asize = vdev_psize_to_asize(vd, psize); ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); - offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d); + uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, + distance, dva, d); + if (offset != -1ULL) { /* * If we've just selected this metaslab group, @@ -1359,22 +3145,30 @@ top: * over- or under-used relative to the pool, * and set an allocation bias to even it out. */ - if (mc->mc_aliquot == 0) { + if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; - /* - * Determine percent used in units of 0..1024. - * (This is just to avoid floating point.) - */ - vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); + vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); + cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); /* - * Bias by at most +/- 25% of the aliquot. + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * For example, if a device is 80% full + * and the pool is 20% full then we should + * reduce allocations by 60% on this device. + * + * mg_bias = (20 - 80) * 512K / 100 = -307K + * + * This reduces allocations by 307K for this + * iteration. */ mg->mg_bias = ((cu - vu) * - (int64_t)mg->mg_aliquot) / (1024 * 4); + (int64_t)mg->mg_aliquot) / 100; + } else if (!metaslab_bias_enabled) { + mg->mg_bias = 0; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= @@ -1395,21 +3189,18 @@ next: mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); - if (!all_zero) { - dshift++; - ASSERT(dshift < 64); - goto top; - } - - if (!allocatable && !zio_lock) { - dshift = 3; - zio_lock = B_TRUE; + /* + * If we haven't tried hard, do so now. 
+ */ + if (!try_hard) { + try_hard = B_TRUE; goto top; } bzero(&dva[d], sizeof (dva_t)); - return (ENOSPC); + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC); + return (SET_ERROR(ENOSPC)); } /* @@ -1446,13 +3237,23 @@ metaslab_free_dva(spa_t *spa, const dva_ mutex_enter(&msp->ms_lock); if (now) { - space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], + range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], offset, size); - space_map_free(&msp->ms_map, offset, size); + + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); + VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + range_tree_add(msp->ms_tree, offset, size); + msp->ms_max_size = metaslab_block_maxsize(msp); } else { - if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) + if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); + range_tree_add(msp->ms_freetree[txg & TXG_MASK], + offset, size); } mutex_exit(&msp->ms_lock); @@ -1478,7 +3279,7 @@ metaslab_claim_dva(spa_t *spa, const dva if ((vd = vdev_lookup_top(spa, vdev)) == NULL || (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) - return (ENXIO); + return (SET_ERROR(ENXIO)); msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; @@ -1487,23 +3288,27 @@ metaslab_claim_dva(spa_t *spa, const dva mutex_enter(&msp->ms_lock); - if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); - if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) - error = ENOENT; + if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) + error = SET_ERROR(ENOENT); if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); } - space_map_claim(&msp->ms_map, offset, size); + VERIFY(!msp->ms_condensing); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); + range_tree_remove(msp->ms_tree, offset, size); if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ - if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) + if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); - space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); + range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); } mutex_exit(&msp->ms_lock); @@ -1511,9 +3316,58 @@ metaslab_claim_dva(spa_t *spa, const dva return (0); } +/* + * Reserve some allocation slots. The reservation system must be called + * before we call into the allocator. If there aren't any available slots + * then the I/O will be throttled until an I/O completes and its slots are + * freed up. The function returns true if it was successful in placing + * the reservation. 
+ */ +boolean_t +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, + int flags) +{ + uint64_t available_slots = 0; + boolean_t slot_reserved = B_FALSE; + + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + + uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); + if (reserved_slots < mc->mc_alloc_max_slots) + available_slots = mc->mc_alloc_max_slots - reserved_slots; + + if (slots <= available_slots || GANG_ALLOCATION(flags)) { + /* + * We reserve the slots individually so that we can unreserve + * them individually when an I/O completes. + */ + for (int d = 0; d < slots; d++) { + reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); + } + zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; + slot_reserved = B_TRUE; + } + + mutex_exit(&mc->mc_lock); + return (slot_reserved); +} + +void +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) +{ + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + for (int d = 0; d < slots; d++) { + (void) refcount_remove(&mc->mc_alloc_slots, zio); + } + mutex_exit(&mc->mc_lock); +} + int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, - int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, + zio_alloc_list_t *zal, zio_t *zio) { dva_t *dva = bp->blk_dva; dva_t *hintdva = hintbp->blk_dva; @@ -1526,24 +3380,35 @@ metaslab_alloc(spa_t *spa, metaslab_clas if (mc->mc_rotor == NULL) { /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); - return (ENOSPC); + return (SET_ERROR(ENOSPC)); } ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); ASSERT(BP_GET_NDVAS(bp) == 0); ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, - txg, flags); - if (error) { + txg, flags, zal); + if (error != 0) { for (d--; d >= 0; d--) { metaslab_free_dva(spa, &dva[d], txg, B_TRUE); + metaslab_group_alloc_decrement(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); bzero(&dva[d], sizeof (dva_t)); } spa_config_exit(spa, SCL_ALLOC, FTAG); return (error); + } else { + /* + * Update the metaslab group's queue depth + * based on the newly allocated dva. 
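The reserve and unreserve pair above can be modelled with a simple counter. This toy stands in for the refcount kept on the metaslab class: slots are taken per DVA before allocation and handed back as each I/O completes, and a caller that cannot reserve is throttled until someone else finishes. The gang-allocation override and the zio bookkeeping are deliberately left out.

#include <stdio.h>

struct throttle { int max_slots, reserved; };

static int
reserve(struct throttle *t, int slots)
{
        int available = t->max_slots - t->reserved;

        if (slots > available)
                return (0);             /* caller must wait and retry */
        t->reserved += slots;
        return (1);
}

static void
unreserve(struct throttle *t, int slots)
{
        t->reserved -= slots;
}

int
main(void)
{
        struct throttle t = { .max_slots = 4, .reserved = 0 };

        printf("3 copies: %s\n", reserve(&t, 3) ? "reserved" : "throttled");
        printf("2 copies: %s\n", reserve(&t, 2) ? "reserved" : "throttled");
        unreserve(&t, 3);               /* the first write's I/Os complete */
        printf("2 copies: %s\n", reserve(&t, 2) ? "reserved" : "throttled");
        return (0);
}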
+ */ + metaslab_group_alloc_increment(spa, + DVA_GET_VDEV(&dva[d]), zio, flags); } + } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); @@ -1602,3 +3467,28 @@ metaslab_claim(spa_t *spa, const blkptr_ return (error); } + +void +metaslab_check_free(spa_t *spa, const blkptr_t *bp) +{ + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); + uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); + metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + if (msp->ms_loaded) + range_tree_verify(msp->ms_tree, offset, size); + + for (int j = 0; j < TXG_SIZE; j++) + range_tree_verify(msp->ms_freetree[j], offset, size); + for (int j = 0; j < TXG_DEFER_SIZE; j++) + range_tree_verify(msp->ms_defertree[j], offset, size); + } + spa_config_exit(spa, SCL_VDEV, FTAG); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/multilist.c 30 Aug 2015 02:17:54 -0000 @@ -0,0 +1,366 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#include +#include + +/* needed for spa_get_random() */ +#include + +/* + * Given the object contained on the list, return a pointer to the + * object's multilist_node_t structure it contains. + */ +static multilist_node_t * +multilist_d2l(multilist_t *ml, void *obj) +{ + return ((multilist_node_t *)((char *)obj + ml->ml_offset)); +} + +/* + * Initialize a new multilist using the parameters specified. + * + * - 'size' denotes the size of the structure containing the + * multilist_node_t. + * - 'offset' denotes the byte offset of the multilist_node_t within + * the structure that contains it. + * - 'num' specifies the number of internal sublists to create. + * - 'index_func' is used to determine which sublist to insert into + * when the multilist_insert() function is called, as well as which + * sublist to remove from when multilist_remove() is called. The + * requirements this function must meet are the following: + * + * - It must always return the same value when called on the same + * object (to ensure the object is removed from the list it was + * inserted into). + * + * - It must return a value in the range [0, number of sublists). + * The multilist_get_num_sublists() function may be used to + * determine the number of sublists in the multilist. + * + * Also, in order to reduce internal contention between the sublists + * during insertion and removal, this function should choose evenly + * between all available sublists when inserting.
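To make the index-function contract above concrete, here is a small sketch (invented for illustration, not code from this patch, and assuming the usual zfs_context kernel environment): a hypothetical object type with an embedded multilist_node_t, and an index function that hashes a field which never changes while the object is linked, so the same sublist is chosen every time and objects spread roughly evenly.

#include <sys/multilist.h>

typedef struct example_obj {
	uint64_t		eo_id;		/* stable identity */
	multilist_node_t	eo_node;	/* linkage used by the multilist */
} example_obj_t;

static unsigned int
example_index_func(multilist_t *ml, void *obj)
{
	example_obj_t *eo = obj;

	/* Same sublist for the same object, always in [0, num sublists). */
	return ((unsigned int)(eo->eo_id % multilist_get_num_sublists(ml)));
}

static void
example_multilist_setup(multilist_t *ml)
{
	multilist_create(ml, sizeof (example_obj_t),
	    offsetof(example_obj_t, eo_node), 8, example_index_func);
}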
This isn't a hard + * requirement, but a general rule of thumb in order to garner the + * best multi-threaded performance out of the data structure. + */ +void +multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num, + multilist_sublist_index_func_t *index_func) +{ + ASSERT3P(ml, !=, NULL); + ASSERT3U(size, >, 0); + ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); + ASSERT3U(num, >, 0); + ASSERT3P(index_func, !=, NULL); + + ml->ml_offset = offset; + ml->ml_num_sublists = num; + ml->ml_index_func = index_func; + + ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_num_sublists, KM_SLEEP); + + ASSERT3P(ml->ml_sublists, !=, NULL); + + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&mls->mls_list, size, offset); + } +} + +/* + * Destroy the given multilist object, and free up any memory it holds. + */ +void +multilist_destroy(multilist_t *ml) +{ + ASSERT(multilist_is_empty(ml)); + + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + + ASSERT(list_is_empty(&mls->mls_list)); + + list_destroy(&mls->mls_list); + mutex_destroy(&mls->mls_lock); + } + + ASSERT3P(ml->ml_sublists, !=, NULL); + kmem_free(ml->ml_sublists, + sizeof (multilist_sublist_t) * ml->ml_num_sublists); + + ml->ml_num_sublists = 0; + ml->ml_offset = 0; +} + +/* + * Insert the given object into the multilist. + * + * This function will insert the object specified into the sublist + * determined using the function given at multilist creation time. + * + * The sublist locks are automatically acquired if not already held, to + * ensure consistency when inserting and removing from multiple threads. + */ +void +multilist_insert(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__insert, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + + /* + * Note: Callers may already hold the sublist lock by calling + * multilist_sublist_lock(). Here we rely on MUTEX_HELD() + * returning TRUE if and only if the current thread holds the + * lock. While it's a little ugly to make the lock recursive in + * this way, it works and allows the calling code to be much + * simpler -- otherwise it would have to pass around a flag + * indicating that it already has the lock. + */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_insert_head(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Remove the given object from the multilist. + * + * This function will remove the object specified from the sublist + * determined using the function given at multilist creation time. + * + * The necessary sublist locks are automatically acquired, to ensure + * consistency when inserting and removing from multiple threads. 
+ */ +void +multilist_remove(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__remove, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + /* See comment in multilist_insert(). */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_remove(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Check to see if this multilist object is empty. + * + * This will return TRUE if it finds all of the sublists of this + * multilist to be empty, and FALSE otherwise. Each sublist lock will be + * automatically acquired as necessary. + * + * If concurrent insertions and removals are occurring, the semantics + * of this function become a little fuzzy. Instead of locking all + * sublists for the entire call time of the function, each sublist is + * only locked as it is individually checked for emptiness. Thus, it's + * possible for this function to return TRUE with non-empty sublists at + * the time the function returns. This would be due to another thread + * inserting into a given sublist, after that specific sublist was checked + * and deemed empty, but before all sublists have been checked. + */ +int +multilist_is_empty(multilist_t *ml) +{ + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + /* See comment in multilist_insert(). */ + boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + if (!list_is_empty(&mls->mls_list)) { + if (need_lock) + mutex_exit(&mls->mls_lock); + + return (FALSE); + } + + if (need_lock) + mutex_exit(&mls->mls_lock); + } + + return (TRUE); +} + +/* Return the number of sublists composing this multilist */ +unsigned int +multilist_get_num_sublists(multilist_t *ml) +{ + return (ml->ml_num_sublists); +} + +/* Return a randomly selected, valid sublist index for this multilist */ +unsigned int +multilist_get_random_index(multilist_t *ml) +{ + return (spa_get_random(ml->ml_num_sublists)); +} + +/* Lock and return the sublist specified at the given index */ +multilist_sublist_t * +multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + mutex_enter(&mls->mls_lock); + + return (mls); +} + +void +multilist_sublist_unlock(multilist_sublist_t *mls) +{ + mutex_exit(&mls->mls_lock); +} + +/* + * We're allowing any object to be inserted into this specific sublist, + * but this can lead to trouble if multilist_remove() is called to + * remove this object. Specifically, if calling ml_index_func on this + * object returns an index for a sublist different from what is passed as + * a parameter here, any call to multilist_remove() with this newly + * inserted object is undefined!
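The sublist accessors in this file are normally used in a lock/walk/unlock pattern. A brief sketch (illustrative only; the walk body is left empty) of visiting one sublist while holding only that sublist's lock:

static void
example_walk_one_sublist(multilist_t *ml, unsigned int idx)
{
	multilist_sublist_t *mls;
	void *obj;

	ASSERT3U(idx, <, multilist_get_num_sublists(ml));

	/* Only this sublist is locked; the others stay usable concurrently. */
	mls = multilist_sublist_lock(ml, idx);

	for (obj = multilist_sublist_head(mls); obj != NULL;
	    obj = multilist_sublist_next(mls, obj)) {
		/* ... inspect obj while the sublist lock is held ... */
	}

	multilist_sublist_unlock(mls);
}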
(the call to multilist_remove() will + * remove the object from a list that it isn't contained in) + */ +void +multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_head(&mls->mls_list, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_tail(&mls->mls_list, obj); +} + +/* + * Move the object one element forward in the list. + * + * This function will move the given object forward in the list (towards + * the head) by one object. So, in essence, it will swap its position in + * the list with its "prev" pointer. If the given object is already at the + * head of the list, it cannot be moved forward any more than it already + * is, so no action is taken. + * + * NOTE: This function **must not** remove any object from the list other + * than the object given as the parameter. This is relied upon in + * arc_evict_state_impl(). + */ +void +multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) +{ + void *prev = list_prev(&mls->mls_list, obj); + + ASSERT(MUTEX_HELD(&mls->mls_lock)); + ASSERT(!list_is_empty(&mls->mls_list)); + + /* 'obj' must be at the head of the list, nothing to do */ + if (prev == NULL) + return; + + list_remove(&mls->mls_list, obj); + list_insert_before(&mls->mls_list, prev, obj); +} + +void +multilist_sublist_remove(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_remove(&mls->mls_list, obj); +} + +void * +multilist_sublist_head(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_head(&mls->mls_list)); +} + +void * +multilist_sublist_tail(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_tail(&mls->mls_list)); +} + +void * +multilist_sublist_next(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_next(&mls->mls_list, obj)); +} + +void * +multilist_sublist_prev(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_prev(&mls->mls_list, obj)); +} + +void +multilist_link_init(multilist_node_t *link) +{ + list_link_init(link); +} + +int +multilist_link_active(multilist_node_t *link) +{ + return (list_link_active(link)); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/range_tree.c 1 Nov 2014 10:04:53 -0000 @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +kmem_cache_t *range_seg_cache; + +void +range_tree_init(void) +{ + ASSERT(range_seg_cache == NULL); + range_seg_cache = kmem_cache_create("range_seg_cache", + sizeof (range_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +range_tree_fini(void) +{ + kmem_cache_destroy(range_seg_cache); + range_seg_cache = NULL; +} + +void +range_tree_stat_verify(range_tree_t *rt) +{ + range_seg_t *rs; + uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; + int i; + + for (rs = avl_first(&rt->rt_root); rs != NULL; + rs = AVL_NEXT(&rt->rt_root, rs)) { + uint64_t size = rs->rs_end - rs->rs_start; + int idx = highbit64(size) - 1; + + hist[idx]++; + ASSERT3U(hist[idx], !=, 0); + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + if (hist[i] != rt->rt_histogram[i]) { + zfs_dbgmsg("i=%d, hist=%p, hist=%llu, rt_hist=%llu", + i, hist, hist[i], rt->rt_histogram[i]); + } + VERIFY3U(hist[i], ==, rt->rt_histogram[i]); + } +} + +static void +range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) +{ + uint64_t size = rs->rs_end - rs->rs_start; + int idx = highbit64(size) - 1; + + ASSERT(size != 0); + ASSERT3U(idx, <, + sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); + + ASSERT(MUTEX_HELD(rt->rt_lock)); + rt->rt_histogram[idx]++; + ASSERT3U(rt->rt_histogram[idx], !=, 0); +} + +static void +range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) +{ + uint64_t size = rs->rs_end - rs->rs_start; + int idx = highbit64(size) - 1; + + ASSERT(size != 0); + ASSERT3U(idx, <, + sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); + + ASSERT(MUTEX_HELD(rt->rt_lock)); + ASSERT3U(rt->rt_histogram[idx], !=, 0); + rt->rt_histogram[idx]--; +} + +/* + * NOTE: caller is responsible for all locking. 
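The histogram bucket used by range_tree_stat_incr()/range_tree_stat_decr() above is simply floor(log2(segment size)). A small stand-alone sketch (user-space only, with a local stand-in for the kernel's highbit64(), which is assumed to return the 1-based position of the highest set bit) makes the mapping explicit:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for highbit64(): 1-based position of the highest set bit. */
static int
example_highbit64(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}

int
main(void)
{
	uint64_t sizes[] = { 512, 4096, 12288, 131072 };

	for (int i = 0; i < 4; i++) {
		/* Same expression as the stat functions: highbit64(size) - 1. */
		printf("size %llu -> bucket %d\n",
		    (unsigned long long)sizes[i],
		    example_highbit64(sizes[i]) - 1);
	}
	return (0);
}

A 4 KB segment lands in bucket 12 and a 12 KB segment in bucket 13, which is why the histogram only needs one counter per power of two.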
+ */ +static int +range_tree_seg_compare(const void *x1, const void *x2) +{ + const range_seg_t *r1 = x1; + const range_seg_t *r2 = x2; + + if (r1->rs_start < r2->rs_start) { + if (r1->rs_end > r2->rs_start) + return (0); + return (-1); + } + if (r1->rs_start > r2->rs_start) { + if (r1->rs_start < r2->rs_end) + return (0); + return (1); + } + return (0); +} + +range_tree_t * +range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp) +{ + range_tree_t *rt; + + rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); + + avl_create(&rt->rt_root, range_tree_seg_compare, + sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); + + rt->rt_lock = lp; + rt->rt_ops = ops; + rt->rt_arg = arg; + + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_create(rt, rt->rt_arg); + + return (rt); +} + +void +range_tree_destroy(range_tree_t *rt) +{ + VERIFY0(rt->rt_space); + + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_destroy(rt, rt->rt_arg); + + avl_destroy(&rt->rt_root); + kmem_free(rt, sizeof (*rt)); +} + +void +range_tree_add(void *arg, uint64_t start, uint64_t size) +{ + range_tree_t *rt = arg; + avl_index_t where; + range_seg_t rsearch, *rs_before, *rs_after, *rs; + uint64_t end = start + size; + boolean_t merge_before, merge_after; + + ASSERT(MUTEX_HELD(rt->rt_lock)); + VERIFY(size != 0); + + rsearch.rs_start = start; + rsearch.rs_end = end; + rs = avl_find(&rt->rt_root, &rsearch, &where); + + if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { + zfs_panic_recover("zfs: allocating allocated segment" + "(offset=%llu size=%llu)\n", + (longlong_t)start, (longlong_t)size); + return; + } + + /* Make sure we don't overlap with either of our neighbors */ + VERIFY(rs == NULL); + + rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); + rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); + + merge_before = (rs_before != NULL && rs_before->rs_end == start); + merge_after = (rs_after != NULL && rs_after->rs_start == end); + + if (merge_before && merge_after) { + avl_remove(&rt->rt_root, rs_before); + if (rt->rt_ops != NULL) { + rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); + rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); + } + + range_tree_stat_decr(rt, rs_before); + range_tree_stat_decr(rt, rs_after); + + rs_after->rs_start = rs_before->rs_start; + kmem_cache_free(range_seg_cache, rs_before); + rs = rs_after; + } else if (merge_before) { + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); + + range_tree_stat_decr(rt, rs_before); + + rs_before->rs_end = end; + rs = rs_before; + } else if (merge_after) { + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); + + range_tree_stat_decr(rt, rs_after); + + rs_after->rs_start = start; + rs = rs_after; + } else { + rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + rs->rs_start = start; + rs->rs_end = end; + avl_insert(&rt->rt_root, rs, where); + } + + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + range_tree_stat_incr(rt, rs); + rt->rt_space += size; +} + +void +range_tree_remove(void *arg, uint64_t start, uint64_t size) +{ + range_tree_t *rt = arg; + avl_index_t where; + range_seg_t rsearch, *rs, *newseg; + uint64_t end = start + size; + boolean_t left_over, right_over; + + ASSERT(MUTEX_HELD(rt->rt_lock)); + VERIFY3U(size, !=, 0); + VERIFY3U(size, <=, rt->rt_space); + + rsearch.rs_start = start; + rsearch.rs_end = end; + rs = avl_find(&rt->rt_root, &rsearch, &where); + + /* Make sure we completely overlap with someone */ + if (rs == NULL) { + zfs_panic_recover("zfs: 
freeing free segment " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size); + return; + } + VERIFY3U(rs->rs_start, <=, start); + VERIFY3U(rs->rs_end, >=, end); + + left_over = (rs->rs_start != start); + right_over = (rs->rs_end != end); + + range_tree_stat_decr(rt, rs); + + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + if (left_over && right_over) { + newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + newseg->rs_start = end; + newseg->rs_end = rs->rs_end; + range_tree_stat_incr(rt, newseg); + + rs->rs_end = start; + + avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); + } else if (left_over) { + rs->rs_end = start; + } else if (right_over) { + rs->rs_start = end; + } else { + avl_remove(&rt->rt_root, rs); + kmem_cache_free(range_seg_cache, rs); + rs = NULL; + } + + if (rs != NULL) { + range_tree_stat_incr(rt, rs); + + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + } + + rt->rt_space -= size; +} + +static range_seg_t * +range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) +{ + avl_index_t where; + range_seg_t rsearch; + uint64_t end = start + size; + + ASSERT(MUTEX_HELD(rt->rt_lock)); + VERIFY(size != 0); + + rsearch.rs_start = start; + rsearch.rs_end = end; + return (avl_find(&rt->rt_root, &rsearch, &where)); +} + +static range_seg_t * +range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_t *rs = range_tree_find_impl(rt, start, size); + if (rs != NULL && rs->rs_start <= start && rs->rs_end >= start + size) + return (rs); + return (NULL); +} + +void +range_tree_verify(range_tree_t *rt, uint64_t off, uint64_t size) +{ + range_seg_t *rs; + + mutex_enter(rt->rt_lock); + rs = range_tree_find(rt, off, size); + if (rs != NULL) + panic("freeing free block; rs=%p", (void *)rs); + mutex_exit(rt->rt_lock); +} + +boolean_t +range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) +{ + return (range_tree_find(rt, start, size) != NULL); +} + +/* + * Ensure that this range is not in the tree, regardless of whether + * it is currently in the tree. 
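Putting the pieces together, a minimal usage sketch (illustrative only; this tree has no ops vector, and the caller supplies and holds the lock, per the locking note earlier in this file):

#include <sys/range_tree.h>

static void
example_range_tree_usage(kmutex_t *lock)
{
	range_tree_t *rt;

	rt = range_tree_create(NULL, NULL, lock);

	mutex_enter(lock);
	range_tree_add(rt, 0x2000, 0x1000);		/* track [8K, 12K) */
	ASSERT(range_tree_contains(rt, 0x2000, 0x1000));
	ASSERT3U(range_tree_space(rt), ==, 0x1000);
	range_tree_remove(rt, 0x2000, 0x1000);		/* tree is empty again */
	mutex_exit(lock);

	range_tree_destroy(rt);		/* requires rt_space == 0 */
}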
+ */ +void +range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_t *rs; + + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { + uint64_t free_start = MAX(rs->rs_start, start); + uint64_t free_end = MIN(rs->rs_end, start + size); + range_tree_remove(rt, free_start, free_end - free_start); + } +} + +void +range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) +{ + range_tree_t *rt; + + ASSERT(MUTEX_HELD((*rtsrc)->rt_lock)); + ASSERT0(range_tree_space(*rtdst)); + ASSERT0(avl_numnodes(&(*rtdst)->rt_root)); + + rt = *rtsrc; + *rtsrc = *rtdst; + *rtdst = rt; +} + +void +range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) +{ + range_seg_t *rs; + void *cookie = NULL; + + ASSERT(MUTEX_HELD(rt->rt_lock)); + + if (rt->rt_ops != NULL) + rt->rt_ops->rtop_vacate(rt, rt->rt_arg); + + while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { + if (func != NULL) + func(arg, rs->rs_start, rs->rs_end - rs->rs_start); + kmem_cache_free(range_seg_cache, rs); + } + + bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); + rt->rt_space = 0; +} + +void +range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) +{ + range_seg_t *rs; + + ASSERT(MUTEX_HELD(rt->rt_lock)); + + for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) + func(arg, rs->rs_start, rs->rs_end - rs->rs_start); +} + +uint64_t +range_tree_space(range_tree_t *rt) +{ + return (rt->rt_space); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 refcount.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c 7 Aug 2009 18:33:07 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/refcount.c 10 Oct 2016 11:09:56 -0000 @@ -19,29 +19,31 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include -#if defined(DEBUG) || !defined(_KERNEL) +#ifdef ZFS_DEBUG #ifdef _KERNEL int reference_tracking_enable = FALSE; /* runs out of memory too easily */ +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, + &reference_tracking_enable, 0, + "Track reference holders to refcount_t objects, used mostly by ZFS"); #else int reference_tracking_enable = TRUE; #endif -int reference_history = 4; /* tunable */ +int reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; static kmem_cache_t *reference_history_cache; void -refcount_init(void) +refcount_sysinit(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -67,6 +69,21 @@ refcount_create(refcount_t *rc) offsetof(reference_t, ref_link)); rc->rc_count = 0; rc->rc_removed_count = 0; + rc->rc_tracked = reference_tracking_enable; +} + +void +refcount_create_tracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_TRUE; +} + +void +refcount_create_untracked(refcount_t *rc) +{ + refcount_create(rc); + rc->rc_tracked = B_FALSE; } void @@ -99,31 +116,29 @@ refcount_destroy(refcount_t *rc) int refcount_is_zero(refcount_t *rc) { - ASSERT(rc->rc_count >= 0); return (rc->rc_count == 0); } int64_t refcount_count(refcount_t *rc) { - ASSERT(rc->rc_count >= 0); return (rc->rc_count); } int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder) { - reference_t *ref; + reference_t *ref = NULL; int64_t count; - if (reference_tracking_enable) { + if (rc->rc_tracked) { ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; } mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= 0); - if (reference_tracking_enable) + if (rc->rc_tracked) list_insert_head(&rc->rc_list, ref); rc->rc_count += number; count = rc->rc_count; @@ -147,7 +162,7 @@ refcount_remove_many(refcount_t *rc, uin mutex_enter(&rc->rc_mtx); ASSERT(rc->rc_count >= number); - if (!reference_tracking_enable) { + if (!rc->rc_tracked) { rc->rc_count -= number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -164,7 +179,7 @@ refcount_remove_many(refcount_t *rc, uin KM_SLEEP); list_insert_head(&rc->rc_removed, ref); rc->rc_removed_count++; - if (rc->rc_removed_count >= reference_history) { + if (rc->rc_removed_count > reference_history) { ref = list_tail(&rc->rc_removed); list_remove(&rc->rc_removed, ref); kmem_cache_free(reference_history_cache, @@ -192,4 +207,115 @@ refcount_remove(refcount_t *rc, void *ho return (refcount_remove_many(rc, 1, holder)); } -#endif +void +refcount_transfer(refcount_t *dst, refcount_t *src) +{ + int64_t count, removed_count; + list_t list, removed; + + list_create(&list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + + mutex_enter(&src->rc_mtx); + count = src->rc_count; + removed_count = src->rc_removed_count; + src->rc_count = 0; + src->rc_removed_count = 0; + list_move_tail(&list, &src->rc_list); + list_move_tail(&removed, &src->rc_removed); + mutex_exit(&src->rc_mtx); + + mutex_enter(&dst->rc_mtx); + dst->rc_count += count; + dst->rc_removed_count += removed_count; + list_move_tail(&dst->rc_list, &list); + list_move_tail(&dst->rc_removed, &removed); + mutex_exit(&dst->rc_mtx); + + list_destroy(&list); + list_destroy(&removed); +} + +void +refcount_transfer_ownership(refcount_t *rc, void *current_holder, + void *new_holder) +{ 
+ reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} + +/* + * If tracking is enabled, return true if a reference exists that matches + * the "holder" tag. If tracking is disabled, then return true if a reference + * might be held. + */ +boolean_t +refcount_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (rc->rc_count > 0); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_FALSE); +} + +/* + * If tracking is enabled, return true if a reference does not exist that + * matches the "holder" tag. If tracking is disabled, always return true + * since the reference might not be held. + */ +boolean_t +refcount_not_held(refcount_t *rc, void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_FALSE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_TRUE); +} +#endif /* ZFS_DEBUG */ Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c,v retrieving revision 1.3 diff -u -p -r1.3 rrwlock.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c 27 Feb 2010 23:43:53 -0000 1.3 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c 2 May 2017 18:05:37 -0000 @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ #include #include @@ -72,8 +75,9 @@ uint_t rrw_tsd_key; typedef struct rrw_node { - struct rrw_node *rn_next; - rrwlock_t *rn_rrl; + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; + void *rn_tag; } rrw_node_t; static rrw_node_t * @@ -95,13 +99,14 @@ rrn_find(rrwlock_t *rrl) * Add a node to the head of the singly linked list. */ static void -rrn_add(rrwlock_t *rrl) +rrn_add(rrwlock_t *rrl, void *tag) { rrw_node_t *rn; rn = kmem_alloc(sizeof (*rn), KM_SLEEP); rn->rn_rrl = rrl; rn->rn_next = tsd_get(rrw_tsd_key); + rn->rn_tag = tag; VERIFY(tsd_set(rrw_tsd_key, rn) == 0); } @@ -110,7 +115,7 @@ rrn_add(rrwlock_t *rrl) * thread's list and return TRUE; otherwise return FALSE. 
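The tracked-holder refcount interface added above is only compiled under ZFS_DEBUG; a short sketch (illustrative only, with invented holder pointers) of the intended discipline:

#include <sys/refcount.h>

static void
example_tracked_refcount(void *holder_a, void *holder_b)
{
	refcount_t rc;

	/* Track holders for this object even if the global tunable is off. */
	refcount_create_tracked(&rc);

	(void) refcount_add(&rc, holder_a);
	ASSERT(refcount_held(&rc, holder_a));
	ASSERT(refcount_not_held(&rc, holder_b));

	/* Move the reference to a new holder without changing the count. */
	refcount_transfer_ownership(&rc, holder_a, holder_b);
	ASSERT(refcount_held(&rc, holder_b));

	(void) refcount_remove(&rc, holder_b);
	ASSERT(refcount_is_zero(&rc));
	refcount_destroy(&rc);
}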
*/ static boolean_t -rrn_find_and_remove(rrwlock_t *rrl) +rrn_find_and_remove(rrwlock_t *rrl, void *tag) { rrw_node_t *rn; rrw_node_t *prev = NULL; @@ -119,7 +124,7 @@ rrn_find_and_remove(rrwlock_t *rrl) return (B_FALSE); for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { - if (rn->rn_rrl == rrl) { + if (rn->rn_rrl == rrl && rn->rn_tag == tag) { if (prev) prev->rn_next = rn->rn_next; else @@ -133,7 +138,7 @@ rrn_find_and_remove(rrwlock_t *rrl) } void -rrw_init(rrwlock_t *rrl) +rrw_init(rrwlock_t *rrl, boolean_t track_all) { mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); @@ -141,6 +146,7 @@ rrw_init(rrwlock_t *rrl) refcount_create(&rrl->rr_anon_rcount); refcount_create(&rrl->rr_linked_rcount); rrl->rr_writer_wanted = B_FALSE; + rrl->rr_track_all = track_all; } void @@ -154,11 +160,12 @@ rrw_destroy(rrwlock_t *rrl) } static void -rrw_enter_read(rrwlock_t *rrl, void *tag) +rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) { mutex_enter(&rrl->rr_lock); #if !defined(DEBUG) && defined(_KERNEL) - if (!rrl->rr_writer && !rrl->rr_writer_wanted) { + if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && + !rrl->rr_track_all) { rrl->rr_anon_rcount.rc_count++; mutex_exit(&rrl->rr_lock); return; @@ -168,14 +175,14 @@ rrw_enter_read(rrwlock_t *rrl, void *tag ASSERT(rrl->rr_writer != curthread); ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); - while (rrl->rr_writer || (rrl->rr_writer_wanted && - refcount_is_zero(&rrl->rr_anon_rcount) && + while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && + refcount_is_zero(&rrl->rr_anon_rcount) && !prio && rrn_find(rrl) == NULL)) cv_wait(&rrl->rr_cv, &rrl->rr_lock); - if (rrl->rr_writer_wanted) { + if (rrl->rr_writer_wanted || rrl->rr_track_all) { /* may or may not be a re-entrant enter */ - rrn_add(rrl); + rrn_add(rrl, tag); (void) refcount_add(&rrl->rr_linked_rcount, tag); } else { (void) refcount_add(&rrl->rr_anon_rcount, tag); @@ -184,7 +191,26 @@ rrw_enter_read(rrwlock_t *rrl, void *tag mutex_exit(&rrl->rr_lock); } -static void +void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + rrw_enter_read_impl(rrl, B_FALSE, tag); +} + +/* + * take a read lock even if there are pending write lock requests. if we want + * to take a lock reentrantly, but from different threads (that have a + * relationship to each other), the normal detection mechanism to overrule + * the pending writer does not work, so we have to give an explicit hint here. + */ +void +rrw_enter_read_prio(rrwlock_t *rrl, void *tag) +{ + rrw_enter_read_impl(rrl, B_TRUE, tag); +} + + +void rrw_enter_write(rrwlock_t *rrl) { mutex_enter(&rrl->rr_lock); @@ -230,10 +256,12 @@ rrw_exit(rrwlock_t *rrl, void *tag) if (rrl->rr_writer == NULL) { int64_t count; - if (rrn_find_and_remove(rrl)) + if (rrn_find_and_remove(rrl, tag)) { count = refcount_remove(&rrl->rr_linked_rcount, tag); - else + } else { + ASSERT(!rrl->rr_track_all); count = refcount_remove(&rrl->rr_anon_rcount, tag); + } if (count == 0) cv_broadcast(&rrl->rr_cv); } else { @@ -246,6 +274,11 @@ rrw_exit(rrwlock_t *rrl, void *tag) mutex_exit(&rrl->rr_lock); } +/* + * If the lock was created with track_all, rrw_held(RW_READER) will return + * B_TRUE iff the current thread has the lock for reader. Otherwise it may + * return B_TRUE if any thread has the lock for reader. 
+ */ boolean_t rrw_held(rrwlock_t *rrl, krw_t rw) { @@ -256,9 +289,107 @@ rrw_held(rrwlock_t *rrl, krw_t rw) held = (rrl->rr_writer == curthread); } else { held = (!refcount_is_zero(&rrl->rr_anon_rcount) || - !refcount_is_zero(&rrl->rr_linked_rcount)); + rrn_find(rrl) != NULL); } mutex_exit(&rrl->rr_lock); return (held); } + +void +rrw_tsd_destroy(void *arg) +{ + rrw_node_t *rn = arg; + if (rn != NULL) { + panic("thread %p terminating with rrw lock %p held", + (void *)curthread, (void *)rn->rn_rrl); + } +} + +/* + * A reader-mostly lock implementation, tuning above reader-writer locks + * for hightly parallel read acquisitions, while pessimizing writes. + * + * The idea is to split single busy lock into array of locks, so that + * each reader can lock only one of them for read, depending on result + * of simple hash function. That proportionally reduces lock congestion. + * Writer same time has to sequentially aquire write on all the locks. + * That makes write aquisition proportionally slower, but in places where + * it is used (filesystem unmount) performance is not critical. + * + * All the functions below are direct wrappers around functions above. + */ +void +rrm_init(rrmlock_t *rrl, boolean_t track_all) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_init(&rrl->locks[i], track_all); +} + +void +rrm_destroy(rrmlock_t *rrl) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_destroy(&rrl->locks[i]); +} + +void +rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrm_enter_read(rrl, tag); + else + rrm_enter_write(rrl); +} + +/* + * This maps the current thread to a specific lock. Note that the lock + * must be released by the same thread that acquired it. We do this + * mapping by taking the thread pointer mod a prime number. We examine + * only the low 32 bits of the thread pointer, because 32-bit division + * is faster than 64-bit division, and the high 32 bits have little + * entropy anyway. + */ +#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) + +void +rrm_enter_read(rrmlock_t *rrl, void *tag) +{ + rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); +} + +void +rrm_enter_write(rrmlock_t *rrl) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_enter_write(&rrl->locks[i]); +} + +void +rrm_exit(rrmlock_t *rrl, void *tag) +{ + int i; + + if (rrl->locks[0].rr_writer == curthread) { + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_exit(&rrl->locks[i], tag); + } else { + rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); + } +} + +boolean_t +rrm_held(rrmlock_t *rrl, krw_t rw) +{ + if (rw == RW_WRITER) { + return (rrw_held(&rrl->locks[0], rw)); + } else { + return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); + } +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sa.c 11 May 2017 16:36:37 -0000 @@ -0,0 +1,2028 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
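For the reader-mostly wrapper above, a minimal sketch of the intended usage (illustrative only; the static lock object is invented and FTAG is the usual ZFS tag convention):

#include <sys/rrwlock.h>

static rrmlock_t example_lock;

static void
example_rrm_usage(void)
{
	rrm_init(&example_lock, B_FALSE);

	/* Readers take only the one internal lock chosen by RRM_TD_LOCK(). */
	rrm_enter_read(&example_lock, FTAG);
	ASSERT(rrm_held(&example_lock, RW_READER));
	rrm_exit(&example_lock, FTAG);

	/* Writers sweep all RRM_NUM_LOCKS internal locks, so writes are slow. */
	rrm_enter_write(&example_lock);
	ASSERT(rrm_held(&example_lock, RW_WRITER));
	rrm_exit(&example_lock, FTAG);

	rrm_destroy(&example_lock);
}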
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 iXsystems, Inc + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS System attributes: + * + * A generic mechanism to allow for arbitrary attributes + * to be stored in a dnode. The data will be stored in the bonus buffer of + * the dnode and if necessary a special "spill" block will be used to handle + * overflow situations. The spill block will be sized to fit the data + * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the + * spill block is stored at the end of the current bonus buffer. Any + * attributes that would be in the way of the blkptr_t will be relocated + * into the spill block. + * + * Attribute registration: + * + * Stored persistently on a per dataset basis + * a mapping between attribute "string" names and their actual attribute + * numeric values, length, and byteswap function. The names are only used + * during registration. All attributes are known by their unique attribute + * id value. If an attribute can have a variable size then the value + * 0 will be used to indicate this. + * + * Attribute Layout: + * + * Attribute layouts are a way to compactly store multiple attributes, but + * without taking the overhead associated with managing each attribute + * individually. Since you will typically have the same set of attributes + * stored in the same order a single table will be used to represent that + * layout. The ZPL for example will usually have only about 10 different + * layouts (regular files, device files, symlinks, + * regular files + scanstamp, files/dir with extended attributes, and then + * you have the possibility of all of those minus ACL, because it would + * be kicked out into the spill block) + * + * Layouts are simply an array of the attributes and their + * ordering i.e. [0, 1, 4, 5, 2] + * + * Each distinct layout is given a unique layout number and that is whats + * stored in the header at the beginning of the SA data buffer. + * + * A layout only covers a single dbuf (bonus or spill). If a set of + * attributes is split up between the bonus buffer and a spill buffer then + * two different layouts will be used. This allows us to byteswap the + * spill without looking at the bonus buffer and keeps the on disk format of + * the bonus and spill buffer the same. + * + * Adding a single attribute will cause the entire set of attributes to + * be rewritten and could result in a new layout number being constructed + * as part of the rewrite if no such layout exists for the new set of + * attribues. The new attribute will be appended to the end of the already + * existing attributes. 
+ * + * Both the attribute registration and attribute layout information are + * stored in normal ZAP attributes. Their should be a small number of + * known layouts and the set of attributes is assumed to typically be quite + * small. + * + * The registered attributes and layout "table" information is maintained + * in core and a special "sa_os_t" is attached to the objset_t. + * + * A special interface is provided to allow for quickly applying + * a large set of attributes at once. sa_replace_all_by_template() is + * used to set an array of attributes. This is used by the ZPL when + * creating a brand new file. The template that is passed into the function + * specifies the attribute, size for variable length attributes, location of + * data and special "data locator" function if the data isn't in a contiguous + * location. + * + * Byteswap implications: + * + * Since the SA attributes are not entirely self describing we can't do + * the normal byteswap processing. The special ZAP layout attribute and + * attribute registration attributes define the byteswap function and the + * size of the attributes, unless it is variable sized. + * The normal ZFS byteswapping infrastructure assumes you don't need + * to read any objects in order to do the necessary byteswapping. Whereas + * SA attributes can only be properly byteswapped if the dataset is opened + * and the layout/attribute ZAP attributes are available. Because of this + * the SA attributes will be byteswapped when they are first accessed by + * the SA code that will read the SA data. + */ + +typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, + uint16_t length, int length_idx, boolean_t, void *userp); + +static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); +static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); +static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, + void *data); +static void sa_idx_tab_rele(objset_t *os, void *arg); +static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, + int buflen); +static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx); + +arc_byteswap_func_t *sa_bswap_table[] = { + byteswap_uint64_array, + byteswap_uint32_array, + byteswap_uint16_array, + byteswap_uint8_array, + zfs_acl_byteswap, +}; + +#define SA_COPY_DATA(f, s, t, l) \ + { \ + if (f == NULL) { \ + if (l == 8) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + } else if (l == 16) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + *(uint64_t *)((uintptr_t)t + 8) = \ + *(uint64_t *)((uintptr_t)s + 8); \ + } else { \ + bcopy(s, t, l); \ + } \ + } else \ + sa_copy_data(f, s, t, l); \ + } + +/* + * This table is fixed and cannot be changed. Its purpose is to + * allow the SA code to work with both old/new ZPL file systems. + * It contains the list of legacy attributes. These attributes aren't + * stored in the "attribute" registry zap objects, since older ZPL file systems + * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will + * use this static table. 
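As a rough sketch of the bulk interface this overview describes (illustrative only: the wrapper name, the caller-chosen attribute ids, and the direct call to the internal sa_attr_op() with SA_LOOKUP are assumptions, not a prescribed sequence), fetching two fixed-size attributes in one pass might look like:

#include <sys/sa.h>
#include <sys/sa_impl.h>

static int
example_sa_bulk_lookup(sa_handle_t *hdl, sa_attr_type_t size_attr,
    sa_attr_type_t links_attr, uint64_t *sizep, uint64_t *linksp)
{
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/* Describe each wanted attribute: id, optional locator, buffer, length. */
	bulk[count].sa_attr = size_attr;
	bulk[count].sa_data_func = NULL;
	bulk[count].sa_data = sizep;
	bulk[count].sa_length = sizeof (uint64_t);
	count++;

	bulk[count].sa_attr = links_attr;
	bulk[count].sa_data_func = NULL;
	bulk[count].sa_data = linksp;
	bulk[count].sa_length = sizeof (uint64_t);
	count++;

	/* One pass over the bonus buffer and, if present, the spill block. */
	return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
}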
+ */ +sa_attr_reg_t sa_legacy_attrs[] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, +}; + +/* + * This is only used for objects of type DMU_OT_ZNODE + */ +sa_attr_type_t sa_legacy_zpl_layout[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +/* + * Special dummy layout used for buffers with no attributes. + */ +sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; + +static int sa_legacy_attr_count = 16; +static kmem_cache_t *sa_cache = NULL; + +/*ARGSUSED*/ +static int +sa_cache_constructor(void *buf, void *unused, int kmflag) +{ + sa_handle_t *hdl = buf; + +#ifdef __NetBSD__ + hdl = unused; +#endif + mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED*/ +static void +sa_cache_destructor(void *buf, void *unused) +{ + sa_handle_t *hdl = buf; + +#ifdef __NetBSD__ + hdl = unused; +#endif + mutex_destroy(&hdl->sa_lock); +} + +void +sa_cache_init(void) +{ + sa_cache = kmem_cache_create("sa_cache", + sizeof (sa_handle_t), 0, sa_cache_constructor, + sa_cache_destructor, NULL, NULL, NULL, 0); +} + +void +sa_cache_fini(void) +{ + if (sa_cache) + kmem_cache_destroy(sa_cache); +} + +static int +layout_num_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = arg1; + const sa_lot_t *node2 = arg2; + + if (node1->lot_num > node2->lot_num) + return (1); + else if (node1->lot_num < node2->lot_num) + return (-1); + return (0); +} + +static int +layout_hash_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = arg1; + const sa_lot_t *node2 = arg2; + + if (node1->lot_hash > node2->lot_hash) + return (1); + if (node1->lot_hash < node2->lot_hash) + return (-1); + if (node1->lot_instance > node2->lot_instance) + return (1); + if (node1->lot_instance < node2->lot_instance) + return (-1); + return (0); +} + +boolean_t +sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) +{ + int i; + + if (count != tbf->lot_attr_count) + return (1); + + for (i = 0; i != count; i++) { + if (attrs[i] != tbf->lot_attrs[i]) + return (1); + } + return (0); +} + +#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) + +static uint64_t +sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) +{ + int i; + uint64_t crc = -1ULL; + + for (i = 0; i != attr_count; i++) + crc ^= SA_ATTR_HASH(attrs[i]); + + return (crc); +} + +static int +sa_get_spill(sa_handle_t *hdl) +{ + int rc; + if (hdl->sa_spill == NULL) { + if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, + &hdl->sa_spill)) == 0) + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } else { + rc = 0; + } + + return (rc); +} + +/* + * Main attribute lookup/update function + * returns 0 for success or non zero for failures + * + * 
Operates on bulk array, first failure will abort further processing + */ +int +sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + sa_data_op_t data_op, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + int i; + int error = 0; + sa_buf_type_t buftypes; + + buftypes = 0; + + ASSERT(count > 0); + for (i = 0; i != count; i++) { + ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); + + bulk[i].sa_addr = NULL; + /* First check the bonus buffer */ + + if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( + hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_bonus_tab, + SA_GET_HDR(hdl, SA_BONUS), + bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); + if (tx && !(buftypes & SA_BONUS)) { + dmu_buf_will_dirty(hdl->sa_bonus, tx); + buftypes |= SA_BONUS; + } + } + if (bulk[i].sa_addr == NULL && + ((error = sa_get_spill(hdl)) == 0)) { + if (TOC_ATTR_PRESENT( + hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_spill_tab, + SA_GET_HDR(hdl, SA_SPILL), + bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); + if (tx && !(buftypes & SA_SPILL) && + bulk[i].sa_size == bulk[i].sa_length) { + dmu_buf_will_dirty(hdl->sa_spill, tx); + buftypes |= SA_SPILL; + } + } + } + if (error && error != ENOENT) { + return ((error == ECKSUM) ? EIO : error); + } + + switch (data_op) { + case SA_LOOKUP: + if (bulk[i].sa_addr == NULL) + return (SET_ERROR(ENOENT)); + if (bulk[i].sa_data) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_addr, bulk[i].sa_data, + bulk[i].sa_size); + } + continue; + + case SA_UPDATE: + /* existing rewrite of attr */ + if (bulk[i].sa_addr && + bulk[i].sa_size == bulk[i].sa_length) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_addr, + bulk[i].sa_length); + continue; + } else if (bulk[i].sa_addr) { /* attr size change */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_REPLACE, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } else { /* adding new attribute */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_ADD, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } + if (error) + return (error); + break; + } + } + return (error); +} + +static sa_lot_t * +sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, + uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, *findtb; + int i; + avl_index_t loc; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); + tb->lot_attr_count = attr_count; +#ifdef __NetBSD__ + if (attr_count != 0) +#endif + tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + tb->lot_num = lot_num; + tb->lot_hash = hash; + tb->lot_instance = 0; + + if (zapadd) { + char attr_name[8]; + + if (sa->sa_layout_attr_obj == 0) { + sa->sa_layout_attr_obj = zap_create_link(os, + DMU_OT_SA_ATTR_LAYOUTS, + sa->sa_master_obj, SA_LAYOUTS, tx); + } + + (void) snprintf(attr_name, sizeof (attr_name), + "%d", (int)lot_num); + VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, + attr_name, 2, attr_count, attrs, tx)); + } + + list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), + offsetof(sa_idx_tab_t, sa_next)); + + for (i = 0; i != attr_count; i++) { + if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) + tb->lot_var_sizes++; + } + + avl_add(&sa->sa_layout_num_tree, tb); + + /* verify we don't have a hash collision */ + if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != 
NULL) { + for (; findtb && findtb->lot_hash == hash; + findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { + if (findtb->lot_instance != tb->lot_instance) + break; + tb->lot_instance++; + } + } + avl_add(&sa->sa_layout_hash_tree, tb); + return (tb); +} + +static void +sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, + int count, dmu_tx_t *tx, sa_lot_t **lot) +{ + sa_lot_t *tb, tbsearch; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + boolean_t found = B_FALSE; + + mutex_enter(&sa->sa_lock); + tbsearch.lot_hash = hash; + tbsearch.lot_instance = 0; + tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); + if (tb) { + for (; tb && tb->lot_hash == hash; + tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { + if (sa_layout_equal(tb, attrs, count) == 0) { + found = B_TRUE; + break; + } + } + } + if (!found) { + tb = sa_add_layout_entry(os, attrs, count, + avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); + } + mutex_exit(&sa->sa_lock); + *lot = tb; +} + +static int +sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) +{ + int error; + uint32_t blocksize; + + if (size == 0) { + blocksize = SPA_MINBLOCKSIZE; + } else if (size > SPA_OLD_MAXBLOCKSIZE) { + ASSERT(0); + return (SET_ERROR(EFBIG)); + } else { + blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); + } + + error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); + ASSERT(error == 0); + return (error); +} + +static void +sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) +{ + if (func == NULL) { + bcopy(datastart, target, buflen); + } else { + boolean_t start; + int bytes; + void *dataptr; + void *saptr = target; + uint32_t length; + + start = B_TRUE; + bytes = 0; + while (bytes < buflen) { + func(&dataptr, &length, buflen, start, datastart); + bcopy(dataptr, saptr, length); + saptr = (void *)((caddr_t)saptr + length); + bytes += length; + start = B_FALSE; + } + } +} + +/* + * Determine several different sizes + * first the sa header size + * the number of bytes to be stored + * if spill would occur the index in the attribute array is returned + * + * the boolean will_spill will be set when spilling is necessary. It + * is only set when the buftype is SA_BONUS + */ +static int +sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, + boolean_t *will_spill) +{ + int var_size = 0; + int i; + int full_space; + int hdrsize; + int extra_hdrsize; + + if (buftype == SA_BONUS && sa->sa_force_spill) { + *total = 0; + *index = 0; + *will_spill = B_TRUE; + return (0); + } + + *index = -1; + *total = 0; + *will_spill = B_FALSE; + + extra_hdrsize = 0; + hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : + sizeof (sa_hdr_phys_t); + + full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size; + ASSERT(IS_P2ALIGNED(full_space, 8)); + + for (i = 0; i != attr_count; i++) { + boolean_t is_var_sz; + + *total = P2ROUNDUP(*total, 8); + *total += attr_desc[i].sa_length; + if (*will_spill) + continue; + + is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); + if (is_var_sz) { + var_size++; + } + + if (is_var_sz && var_size > 1) { + /* + * Don't worry that the spill block might overflow. + * It will be resized if needed in sa_build_layouts(). + */ + if (buftype == SA_SPILL || + P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + + *total < full_space) { + /* + * Account for header space used by array of + * optional sizes of variable-length attributes. 
+ * Record the extra header size in case this + * increase needs to be reversed due to + * spill-over. + */ + hdrsize += sizeof (uint16_t); + if (*index != -1) + extra_hdrsize += sizeof (uint16_t); + } else { + ASSERT(buftype == SA_BONUS); + if (*index == -1) + *index = i; + *will_spill = B_TRUE; + continue; + } + } + + /* + * find index of where spill *could* occur. + * Then continue to count of remainder attribute + * space. The sum is used later for sizing bonus + * and spill buffer. + */ + if (buftype == SA_BONUS && *index == -1 && + (*total + P2ROUNDUP(hdrsize, 8)) > + (full_space - sizeof (blkptr_t))) { + *index = i; + } + + if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && + buftype == SA_BONUS) + *will_spill = B_TRUE; + } + + if (*will_spill) + hdrsize -= extra_hdrsize; + + hdrsize = P2ROUNDUP(hdrsize, 8); + return (hdrsize); +} + +#define BUF_SPACE_NEEDED(total, header) (total + header) + +/* + * Find layout that corresponds to ordering of attributes + * If not found a new layout number is created and added to + * persistent layout tables. + */ +static int +sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + uint64_t hash; + sa_buf_type_t buftype; + sa_hdr_phys_t *sahdr; + void *data_start; + int buf_space; + sa_attr_type_t *attrs, *attrs_start; + int i, lot_count; + int hdrsize; + int spillhdrsize = 0; + int used; + dmu_object_type_t bonustype; + sa_lot_t *lot; + int len_idx; + int spill_used; + boolean_t spilling; + + dmu_buf_will_dirty(hdl->sa_bonus, tx); + bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); + + /* first determine bonus header size and sum of all attributes */ + hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, + SA_BONUS, &i, &used, &spilling); + + if (used > SPA_OLD_MAXBLOCKSIZE) + return (SET_ERROR(EFBIG)); + + VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? + MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : + used + hdrsize, tx)); + + ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || + bonustype == DMU_OT_SA); + + /* setup and size spill buffer when needed */ + if (spilling) { + boolean_t dummy; + + if (hdl->sa_spill == NULL) { + VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, + &hdl->sa_spill) == 0); + } + dmu_buf_will_dirty(hdl->sa_spill, tx); + + spillhdrsize = sa_find_sizes(sa, &attr_desc[i], + attr_count - i, hdl->sa_spill, SA_SPILL, &i, + &spill_used, &dummy); + + if (spill_used > SPA_OLD_MAXBLOCKSIZE) + return (SET_ERROR(EFBIG)); + + buf_space = hdl->sa_spill->db_size - spillhdrsize; + if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > + hdl->sa_spill->db_size) + VERIFY(0 == sa_resize_spill(hdl, + BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); + } + + /* setup starting pointers to lay down data */ + data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); + sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + buftype = SA_BONUS; + + if (spilling) + buf_space = (sa->sa_force_spill) ? 
+ 0 : SA_BLKPTR_SPACE - hdrsize; + else + buf_space = hdl->sa_bonus->db_size - hdrsize; + + attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + lot_count = 0; + + for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { + uint16_t length; + + ASSERT(IS_P2ALIGNED(data_start, 8)); + ASSERT(IS_P2ALIGNED(buf_space, 8)); + attrs[i] = attr_desc[i].sa_attr; + length = SA_REGISTERED_LEN(sa, attrs[i]); + if (length == 0) + length = attr_desc[i].sa_length; + else + VERIFY(length == attr_desc[i].sa_length); + + if (buf_space < length) { /* switch to spill buffer */ + VERIFY(spilling); + VERIFY(bonustype == DMU_OT_SA); + if (buftype == SA_BONUS && !sa->sa_force_spill) { + sa_find_layout(hdl->sa_os, hash, attrs_start, + lot_count, tx, &lot); + SA_SET_HDR(sahdr, lot->lot_num, hdrsize); + } + + buftype = SA_SPILL; + hash = -1ULL; + len_idx = 0; + + sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr->sa_magic = SA_MAGIC; + data_start = (void *)((uintptr_t)sahdr + + spillhdrsize); + attrs_start = &attrs[i]; + buf_space = hdl->sa_spill->db_size - spillhdrsize; + lot_count = 0; + } + hash ^= SA_ATTR_HASH(attrs[i]); + attr_desc[i].sa_addr = data_start; + attr_desc[i].sa_size = length; + SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, + data_start, length); + if (sa->sa_attr_table[attrs[i]].sa_length == 0) { + sahdr->sa_lengths[len_idx++] = length; + } + VERIFY((uintptr_t)data_start % 8 == 0); + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + length), 8); + buf_space -= P2ROUNDUP(length, 8); + lot_count++; + } + + sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); + + /* + * Verify that old znodes always have layout number 0. + * Must be DMU_OT_SA for arbitrary layouts + */ + VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || + (bonustype == DMU_OT_SA && lot->lot_num > 1)); + + if (bonustype == DMU_OT_SA) { + SA_SET_HDR(sahdr, lot->lot_num, + buftype == SA_BONUS ? hdrsize : spillhdrsize); + } + + kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); + if (hdl->sa_bonus_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + hdl->sa_bonus_tab = NULL; + } + if (!sa->sa_force_spill) + VERIFY(0 == sa_build_index(hdl, SA_BONUS)); + if (hdl->sa_spill) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + if (!spilling) { + /* + * remove spill block that is no longer needed. 
+ */ + dmu_buf_rele(hdl->sa_spill, NULL); + hdl->sa_spill = NULL; + hdl->sa_spill_tab = NULL; + VERIFY(0 == dmu_rm_spill(hdl->sa_os, + sa_handle_object(hdl), tx)); + } else { + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } + } + + return (0); +} + +static void +sa_free_attr_table(sa_os_t *sa) +{ + int i; + + if (sa->sa_attr_table == NULL) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_name) + kmem_free(sa->sa_attr_table[i].sa_name, + strlen(sa->sa_attr_table[i].sa_name) + 1); + } + + kmem_free(sa->sa_attr_table, + sizeof (sa_attr_table_t) * sa->sa_num_attrs); + + sa->sa_attr_table = NULL; +} + +static int +sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) +{ + sa_os_t *sa = os->os_sa; + uint64_t sa_attr_count = 0; + uint64_t sa_reg_count = 0; + int error = 0; + uint64_t attr_value; + sa_attr_table_t *tb; + zap_cursor_t zc; + zap_attribute_t za; + int registered_count = 0; + int i; + dmu_objset_type_t ostype = dmu_objset_type(os); + + sa->sa_user_table = + kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); + sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); + + if (sa->sa_reg_attr_obj != 0) { + error = zap_count(os, sa->sa_reg_attr_obj, + &sa_attr_count); + + /* + * Make sure we retrieved a count and that it isn't zero + */ + if (error || (error == 0 && sa_attr_count == 0)) { + if (error == 0) + error = SET_ERROR(EINVAL); + goto bail; + } + sa_reg_count = sa_attr_count; + } + + if (ostype == DMU_OST_ZFS && sa_attr_count == 0) + sa_attr_count += sa_legacy_attr_count; + + /* Allocate attribute numbers for attributes that aren't registered */ + for (i = 0; i != count; i++) { + boolean_t found = B_FALSE; + int j; + + if (ostype == DMU_OST_ZFS) { + for (j = 0; j != sa_legacy_attr_count; j++) { + if (strcmp(reg_attrs[i].sa_name, + sa_legacy_attrs[j].sa_name) == 0) { + sa->sa_user_table[i] = + sa_legacy_attrs[j].sa_attr; + found = B_TRUE; + } + } + } + if (found) + continue; + + if (sa->sa_reg_attr_obj) + error = zap_lookup(os, sa->sa_reg_attr_obj, + reg_attrs[i].sa_name, 8, 1, &attr_value); + else + error = SET_ERROR(ENOENT); + switch (error) { + case ENOENT: + sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; + sa_attr_count++; + break; + case 0: + sa->sa_user_table[i] = ATTR_NUM(attr_value); + break; + default: + goto bail; + } + } + + sa->sa_num_attrs = sa_attr_count; + tb = sa->sa_attr_table = + kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); + + /* + * Attribute table is constructed from requested attribute list, + * previously foreign registered attributes, and also the legacy + * ZPL set of attributes. 
+ */ + + if (sa->sa_reg_attr_obj) { + for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t value; + value = za.za_first_integer; + + registered_count++; + tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); + tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); + tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); + tb[ATTR_NUM(value)].sa_registered = B_TRUE; + + if (tb[ATTR_NUM(value)].sa_name) { + continue; + } + tb[ATTR_NUM(value)].sa_name = + kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); + (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, + strlen(za.za_name) +1); + } + zap_cursor_fini(&zc); + /* + * Make sure we processed the correct number of registered + * attributes + */ + if (registered_count != sa_reg_count) { + ASSERT(error != 0); + goto bail; + } + + } + + if (ostype == DMU_OST_ZFS) { + for (i = 0; i != sa_legacy_attr_count; i++) { + if (tb[i].sa_name) + continue; + tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; + tb[i].sa_length = sa_legacy_attrs[i].sa_length; + tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; + tb[i].sa_registered = B_FALSE; + tb[i].sa_name = + kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, + KM_SLEEP); + (void) strlcpy(tb[i].sa_name, + sa_legacy_attrs[i].sa_name, + strlen(sa_legacy_attrs[i].sa_name) + 1); + } + } + + for (i = 0; i != count; i++) { + sa_attr_type_t attr_id; + + attr_id = sa->sa_user_table[i]; + if (tb[attr_id].sa_name) + continue; + + tb[attr_id].sa_length = reg_attrs[i].sa_length; + tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; + tb[attr_id].sa_attr = attr_id; + tb[attr_id].sa_name = + kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); + (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, + strlen(reg_attrs[i].sa_name) + 1); + } + + sa->sa_need_attr_registration = + (sa_attr_count != registered_count); + + return (0); +bail: + kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); + sa->sa_user_table = NULL; + sa_free_attr_table(sa); + return ((error != 0) ? 
error : EINVAL); +} + +int +sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, + sa_attr_type_t **user_table) +{ + zap_cursor_t zc; + zap_attribute_t za; + sa_os_t *sa; + dmu_objset_type_t ostype = dmu_objset_type(os); + sa_attr_type_t *tb; + int error; + + mutex_enter(&os->os_user_ptr_lock); + if (os->os_sa) { + mutex_enter(&os->os_sa->sa_lock); + mutex_exit(&os->os_user_ptr_lock); + tb = os->os_sa->sa_user_table; + mutex_exit(&os->os_sa->sa_lock); + *user_table = tb; + return (0); + } + + sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); + mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); + sa->sa_master_obj = sa_obj; + + os->os_sa = sa; + mutex_enter(&sa->sa_lock); + mutex_exit(&os->os_user_ptr_lock); + avl_create(&sa->sa_layout_num_tree, layout_num_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); + avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); + + if (sa_obj) { + error = zap_lookup(os, sa_obj, SA_LAYOUTS, + 8, 1, &sa->sa_layout_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + error = zap_lookup(os, sa_obj, SA_REGISTRY, + 8, 1, &sa->sa_reg_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + } + + if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) + goto fail; + + if (sa->sa_layout_attr_obj != 0) { + uint64_t layout_count; + + error = zap_count(os, sa->sa_layout_attr_obj, + &layout_count); + + /* + * Layout number count should be > 0 + */ + if (error || (error == 0 && layout_count == 0)) { + if (error == 0) + error = SET_ERROR(EINVAL); + goto fail; + } + + for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + sa_attr_type_t *lot_attrs; + uint64_t lot_num; + + lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * + za.za_num_integers, KM_SLEEP); + + if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, + za.za_name, 2, za.za_num_integers, + lot_attrs))) != 0) { + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + break; + } + VERIFY(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num) == 0); + + (void) sa_add_layout_entry(os, lot_attrs, + za.za_num_integers, lot_num, + sa_layout_info_hash(lot_attrs, + za.za_num_integers), B_FALSE, NULL); + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + } + zap_cursor_fini(&zc); + + /* + * Make sure layout count matches number of entries added + * to AVL tree + */ + if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { + ASSERT(error != 0); + goto fail; + } + } + + /* Add special layout number for old ZNODES */ + if (ostype == DMU_OST_ZFS) { + (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, + sa_legacy_attr_count, 0, + sa_layout_info_hash(sa_legacy_zpl_layout, + sa_legacy_attr_count), B_FALSE, NULL); + + (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, + 0, B_FALSE, NULL); + } + *user_table = os->os_sa->sa_user_table; + mutex_exit(&sa->sa_lock); + return (0); +fail: + os->os_sa = NULL; + sa_free_attr_table(sa); + if (sa->sa_user_table) + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + mutex_exit(&sa->sa_lock); + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); + kmem_free(sa, sizeof (sa_os_t)); + return ((error == ECKSUM) ? 
EIO : error); +} + +void +sa_tear_down(objset_t *os) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *layout; + void *cookie; + + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + + /* Free up attr table */ + + sa_free_attr_table(sa); + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { + sa_idx_tab_t *tab; + while (tab = list_head(&layout->lot_idx_tab)) { + ASSERT(refcount_count(&tab->sa_refcount)); + sa_idx_tab_rele(os, tab); + } + } + + cookie = NULL; + while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { +#ifdef __NetBSD__ + if (layout->lot_attr_count != 0) +#endif + kmem_free(layout->lot_attrs, + sizeof (sa_attr_type_t) * layout->lot_attr_count); + kmem_free(layout, sizeof (sa_lot_t)); + } + + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); + + kmem_free(sa, sizeof (sa_os_t)); + os->os_sa = NULL; +} + +void +sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t var_length, void *userp) +{ + sa_idx_tab_t *idx_tab = userp; + + if (var_length) { + ASSERT(idx_tab->sa_variable_lengths); + idx_tab->sa_variable_lengths[length_idx] = length; + } + TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, + (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); +} + +static void +sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, + sa_iterfunc_t func, sa_lot_t *tab, void *userp) +{ + void *data_start; + sa_lot_t *tb = tab; + sa_lot_t search; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + int i; + uint16_t *length_start = NULL; + uint8_t length_idx = 0; + + if (tab == NULL) { + search.lot_num = SA_LAYOUT_NUM(hdr, type); + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + ASSERT(tb); + } + + if (IS_SA_BONUSTYPE(type)) { + data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + + offsetof(sa_hdr_phys_t, sa_lengths) + + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); + length_start = hdr->sa_lengths; + } else { + data_start = hdr; + } + + for (i = 0; i != tb->lot_attr_count; i++) { + int attr_length, reg_length; + uint8_t idx_len; + + reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; + if (reg_length) { + attr_length = reg_length; + idx_len = 0; + } else { + attr_length = length_start[length_idx]; + idx_len = length_idx++; + } + + func(hdr, data_start, tb->lot_attrs[i], attr_length, + idx_len, reg_length == 0 ? 
B_TRUE : B_FALSE, userp); + + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + attr_length), 8); + } +} + +/*ARGSUSED*/ +void +sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t variable_length, void *userp) +{ + sa_handle_t *hdl = userp; + sa_os_t *sa = hdl->sa_os->os_sa; + + sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); +} + +void +sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); + dmu_buf_impl_t *db; + sa_os_t *sa = hdl->sa_os->os_sa; + int num_lengths = 1; + int i; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + if (sa_hdr_phys->sa_magic == SA_MAGIC) + return; + + db = SA_GET_DB(hdl, buftype); + + if (buftype == SA_SPILL) { + arc_release(db->db_buf, NULL); + arc_buf_thaw(db->db_buf); + } + + sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); + sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); + + /* + * Determine number of variable lenghts in header + * The standard 8 byte header has one for free and a + * 16 byte header would have 4 + 1; + */ + if (SA_HDR_SIZE(sa_hdr_phys) > 8) + num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; + for (i = 0; i != num_lengths; i++) + sa_hdr_phys->sa_lengths[i] = + BSWAP_16(sa_hdr_phys->sa_lengths[i]); + + sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, + sa_byteswap_cb, NULL, hdl); + + if (buftype == SA_SPILL) + arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); +} + +static int +sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys; + dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); + dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); + sa_os_t *sa = hdl->sa_os->os_sa; + sa_idx_tab_t *idx_tab; + + sa_hdr_phys = SA_GET_HDR(hdl, buftype); + + mutex_enter(&sa->sa_lock); + + /* Do we need to byteswap? 
*/ + + /* only check if not old znode */ + if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && + sa_hdr_phys->sa_magic != 0) { + VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); + sa_byteswap(hdl, buftype); + } + + idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); + + if (buftype == SA_BONUS) + hdl->sa_bonus_tab = idx_tab; + else + hdl->sa_spill_tab = idx_tab; + + mutex_exit(&sa->sa_lock); + return (0); +} + +/*ARGSUSED*/ +static void +sa_evict_sync(void *dbu) +{ + panic("evicting sa dbuf\n"); +} + +static void +sa_idx_tab_rele(objset_t *os, void *arg) +{ + sa_os_t *sa = os->os_sa; + sa_idx_tab_t *idx_tab = arg; + + if (idx_tab == NULL) + return; + + mutex_enter(&sa->sa_lock); + if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { + list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); + if (idx_tab->sa_variable_lengths) + kmem_free(idx_tab->sa_variable_lengths, + sizeof (uint16_t) * + idx_tab->sa_layout->lot_var_sizes); + refcount_destroy(&idx_tab->sa_refcount); + kmem_free(idx_tab->sa_idx_tab, + sizeof (uint32_t) * sa->sa_num_attrs); + kmem_free(idx_tab, sizeof (sa_idx_tab_t)); + } + mutex_exit(&sa->sa_lock); +} + +static void +sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) +{ + sa_os_t *sa = os->os_sa; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + (void) refcount_add(&idx_tab->sa_refcount, NULL); +} + +void +sa_handle_destroy(sa_handle_t *hdl) +{ + dmu_buf_t *db = hdl->sa_bonus; + + mutex_enter(&hdl->sa_lock); + (void) dmu_buf_remove_user(db, &hdl->sa_dbu); + + if (hdl->sa_bonus_tab) + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + + if (hdl->sa_spill_tab) + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + + dmu_buf_rele(hdl->sa_bonus, NULL); + + if (hdl->sa_spill) + dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); + mutex_exit(&hdl->sa_lock); + + kmem_cache_free(sa_cache, hdl); +} + +int +sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + int error = 0; + dmu_object_info_t doi; + sa_handle_t *handle = NULL; + +#ifdef ZFS_DEBUG + dmu_object_info_from_db(db, &doi); + ASSERT(doi.doi_bonus_type == DMU_OT_SA || + doi.doi_bonus_type == DMU_OT_ZNODE); +#endif + /* find handle, if it exists */ + /* if one doesn't exist then create a new one, and initialize it */ + + if (hdl_type == SA_HDL_SHARED) + handle = dmu_buf_get_user(db); + + if (handle == NULL) { + sa_handle_t *winner = NULL; + + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + handle->sa_dbu.dbu_evict_func_sync = NULL; + handle->sa_dbu.dbu_evict_func_async = NULL; + handle->sa_userp = userp; + handle->sa_bonus = db; + handle->sa_os = os; + handle->sa_spill = NULL; + handle->sa_bonus_tab = NULL; + handle->sa_spill_tab = NULL; + + error = sa_build_index(handle, SA_BONUS); + + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, + NULL); + winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); + } + + if (winner != NULL) { + kmem_cache_free(sa_cache, handle); + handle = winner; + } + } + *handlepp = handle; + + return (error); +} + +int +sa_handle_get(objset_t *objset, uint64_t objid, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + dmu_buf_t *db; + int error; + + if (error = dmu_bonus_hold(objset, objid, NULL, &db)) + return (error); + + return (sa_handle_get_from_db(objset, db, userp, hdl_type, + handlepp)); +} + +int +sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) +{ + return (dmu_bonus_hold(objset, obj_num, tag, db)); +} + +void 
+sa_buf_rele(dmu_buf_t *db, void *tag) +{ + dmu_buf_rele(db, tag); +} + +int +sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); +} + +int +sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = attr; + bulk.sa_data = buf; + bulk.sa_length = buflen; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_lookup_impl(hdl, &bulk, 1); + mutex_exit(&hdl->sa_lock); + return (error); +} + +#ifdef _KERNEL +int +sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + + mutex_enter(&hdl->sa_lock); + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { + error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, + uio->uio_resid), UIO_READ, uio); + } + mutex_exit(&hdl->sa_lock); + return (error); + +} +#endif + +void * +sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data) +{ + sa_idx_tab_t *idx_tab; + sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data; + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, search; + avl_index_t loc; + + /* + * Deterimine layout number. If SA node and header == 0 then + * force the index table to the dummy "1" empty layout. + * + * The layout number would only be zero for a newly created file + * that has not added any attributes yet, or with crypto enabled which + * doesn't write any attributes to the bonus buffer. + */ + + search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); + + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + + /* Verify header size is consistent with layout information */ + ASSERT(tb); + ASSERT(IS_SA_BONUSTYPE(bonustype) && + SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || + (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); + + /* + * See if any of the already existing TOC entries can be reused? 
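/*
 * Illustrative sketch (not part of the patch): the reuse test in the loop
 * that follows reduces to comparing the header's variable-length array with
 * the lengths a cached table was built for; any mismatch means the cached
 * offsets would be wrong.  struct cached_toc and toc_reusable() are invented
 * names for the example.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct cached_toc {
	const uint16_t	*var_lengths;	/* lengths recorded at build time */
	size_t		nvar;
};

static bool
toc_reusable(const struct cached_toc *toc, const uint16_t *hdr_lengths)
{
	for (size_t i = 0; i < toc->nvar; i++) {
		if (toc->var_lengths[i] != hdr_lengths[i])
			return (false);
	}
	return (true);
}

int
main(void)
{
	uint16_t cached[] = { 12, 256 };
	uint16_t hdr[] = { 12, 256 };
	struct cached_toc toc = { cached, 2 };

	return (toc_reusable(&toc, hdr) ? 0 : 1);
}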
+ */ + + for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; + idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { + boolean_t valid_idx = B_TRUE; + int i; + + if (tb->lot_var_sizes != 0 && + idx_tab->sa_variable_lengths != NULL) { + for (i = 0; i != tb->lot_var_sizes; i++) { + if (hdr->sa_lengths[i] != + idx_tab->sa_variable_lengths[i]) { + valid_idx = B_FALSE; + break; + } + } + } + if (valid_idx) { + sa_idx_tab_hold(os, idx_tab); + return (idx_tab); + } + } + + /* No such luck, create a new entry */ + idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); + idx_tab->sa_idx_tab = + kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); + idx_tab->sa_layout = tb; + refcount_create(&idx_tab->sa_refcount); + if (tb->lot_var_sizes) + idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * + tb->lot_var_sizes, KM_SLEEP); + + sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, + tb, idx_tab); + sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ + sa_idx_tab_hold(os, idx_tab); /* one for layout */ + list_insert_tail(&tb->lot_idx_tab, idx_tab); + return (idx_tab); +} + +void +sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, + boolean_t start, void *userdata) +{ + ASSERT(start); + + *dataptr = userdata; + *len = total_len; +} + +static void +sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) +{ + uint64_t attr_value = 0; + sa_os_t *sa = hdl->sa_os->os_sa; + sa_attr_table_t *tb = sa->sa_attr_table; + int i; + + mutex_enter(&sa->sa_lock); + + if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { + mutex_exit(&sa->sa_lock); + return; + } + + if (sa->sa_reg_attr_obj == 0) { + sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, + sa->sa_master_obj, SA_REGISTRY, tx); + } + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_registered) + continue; + ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, + tb[i].sa_byteswap); + VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, + tb[i].sa_name, 8, 1, &attr_value, tx)); + tb[i].sa_registered = B_TRUE; + } + sa->sa_need_attr_registration = B_FALSE; + mutex_exit(&sa->sa_lock); +} + +/* + * Replace all attributes with attributes specified in template. + * If dnode had a spill buffer then those attributes will be + * also be replaced, possibly with just an empty spill block + * + * This interface is intended to only be used for bulk adding of + * attributes for a new file. It will also be used by the ZPL + * when converting and old formatted znode to native SA support. + */ +int +sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); +} + +int +sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_replace_all_by_template_locked(hdl, attr_desc, + attr_count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * Add/remove a single attribute or replace a variable-sized attribute value + * with a value of a different size, and then rewrite the entire set + * of attributes. + * Same-length attribute value replacement (including fixed-length attributes) + * is handled more efficiently by the upper layers. 
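/*
 * Illustrative sketch (not part of the patch): sa_modify_attrs() below takes
 * the "copy everything, then rewrite the whole set" approach described above.
 * The toy function here shows just the descriptor-rebuilding step for a
 * single add/remove/replace; bulk_t, rebuild_desc() and the ops enum are
 * invented for the example.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct bulk {
	int		attr;
	const void	*data;
	unsigned	len;
} bulk_t;

enum op { OP_ADD, OP_REMOVE, OP_REPLACE };

static bulk_t *
rebuild_desc(const bulk_t *old, size_t n, enum op op, bulk_t change,
    size_t *out_n)
{
	size_t count = n + (op == OP_ADD ? 1 : 0);
	bulk_t *desc = calloc(count, sizeof (*desc));
	size_t j = 0;

	if (desc == NULL)
		return (NULL);
	for (size_t i = 0; i < n; i++) {
		if (old[i].attr == change.attr) {
			if (op == OP_REMOVE)
				continue;		/* drop it */
			desc[j++] = change;		/* replace: new size/data */
		} else {
			desc[j++] = old[i];		/* carried over untouched */
		}
	}
	if (op == OP_ADD)
		desc[j++] = change;			/* appended at the end */
	*out_n = j;
	return (desc);
}

int
main(void)
{
	bulk_t old[] = { { 1, "a", 1 }, { 2, "bb", 2 } };
	bulk_t repl = { 2, "cccc", 4 };
	size_t n;
	bulk_t *desc = rebuild_desc(old, 2, OP_REPLACE, repl, &n);

	if (desc != NULL)
		printf("%zu attrs, attr 2 now %u bytes\n", n, desc[1].len);
	free(desc);
	return (0);
}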
+ */ +static int +sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + sa_bulk_attr_t *attr_desc; + void *old_data[2]; + int bonus_attr_count = 0; + int bonus_data_size = 0; + int spill_data_size = 0; + int spill_attr_count = 0; + int error; + uint16_t length, reg_length; + int i, j, k, length_idx; + sa_hdr_phys_t *hdr; + sa_idx_tab_t *idx_tab; + int attr_count; + int count; + + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* First make of copy of the old data */ + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_bonuslen != 0) { + bonus_data_size = hdl->sa_bonus->db_size; + old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); + bcopy(hdl->sa_bonus->db_data, old_data[0], + hdl->sa_bonus->db_size); + bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; + } else { + old_data[0] = NULL; + } + DB_DNODE_EXIT(db); + + /* Bring spill buffer online if it isn't currently */ + + if ((error = sa_get_spill(hdl)) == 0) { + spill_data_size = hdl->sa_spill->db_size; + old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); + bcopy(hdl->sa_spill->db_data, old_data[1], + hdl->sa_spill->db_size); + spill_attr_count = + hdl->sa_spill_tab->sa_layout->lot_attr_count; + } else if (error && error != ENOENT) { + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + return (error); + } else { + old_data[1] = NULL; + } + + /* build descriptor of all attributes */ + + attr_count = bonus_attr_count + spill_attr_count; + if (action == SA_ADD) + attr_count++; + else if (action == SA_REMOVE) + attr_count--; + + attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); + + /* + * loop through bonus and spill buffer if it exists, and + * build up new attr_descriptor to reset the attributes + */ + k = j = 0; + count = bonus_attr_count; + hdr = SA_GET_HDR(hdl, SA_BONUS); + idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); + for (; k != 2; k++) { + /* + * Iterate over each attribute in layout. Fetch the + * size of variable-length attributes needing rewrite + * from sa_lengths[]. + */ + for (i = 0, length_idx = 0; i != count; i++) { + sa_attr_type_t attr; + + attr = idx_tab->sa_layout->lot_attrs[i]; + reg_length = SA_REGISTERED_LEN(sa, attr); + if (reg_length == 0) { + length = hdr->sa_lengths[length_idx]; + length_idx++; + } else { + length = reg_length; + } + if (attr == newattr) { + /* + * There is nothing to do for SA_REMOVE, + * so it is just skipped. + */ + if (action == SA_REMOVE) + continue; + + /* + * Duplicate attributes are not allowed, so the + * action can not be SA_ADD here. + */ + ASSERT3S(action, ==, SA_REPLACE); + + /* + * Only a variable-sized attribute can be + * replaced here, and its size must be changing. 
+ */ + ASSERT3U(reg_length, ==, 0); + ASSERT3U(length, !=, buflen); + SA_ADD_BULK_ATTR(attr_desc, j, attr, + locator, datastart, buflen); + } else { + SA_ADD_BULK_ATTR(attr_desc, j, attr, + NULL, (void *) + (TOC_OFF(idx_tab->sa_idx_tab[attr]) + + (uintptr_t)old_data[k]), length); + } + } + if (k == 0 && hdl->sa_spill) { + hdr = SA_GET_HDR(hdl, SA_SPILL); + idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); + count = spill_attr_count; + } else { + break; + } + } + if (action == SA_ADD) { + reg_length = SA_REGISTERED_LEN(sa, newattr); + IMPLY(reg_length != 0, reg_length == buflen); + SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, + datastart, buflen); + } + ASSERT3U(j, ==, attr_count); + + error = sa_build_layouts(hdl, attr_desc, attr_count, tx); + + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + if (old_data[1]) + kmem_free(old_data[1], spill_data_size); + kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); + + return (error); +} + +static int +sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + dmu_tx_t *tx) +{ + int error; + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_object_type_t bonustype; + + bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); + + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* sync out registration table if necessary */ + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + + error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); + if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) + sa->sa_update_cb(hdl, tx); + + return (error); +} + +/* + * update or add new attribute + */ +int +sa_update(sa_handle_t *hdl, sa_attr_type_t type, + void *buf, uint32_t buflen, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = type; + bulk.sa_data_func = NULL; + bulk.sa_length = buflen; + bulk.sa_data = buf; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, + uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_attr = attr; + bulk.sa_data = userdata; + bulk.sa_data_func = locator; + bulk.sa_length = buflen; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * Return size of an attribute + */ + +int +sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) +{ + sa_bulk_attr_t bulk; + int error; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { + mutex_exit(&hdl->sa_lock); + return (error); + } + *size = bulk.sa_size; + + mutex_exit(&hdl->sa_lock); + return (0); +} + +int +sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_lookup_impl(hdl, attrs, count)); +} + +int +sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_lookup_locked(hdl, attrs, count); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, attrs, count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int 
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, + NULL, 0, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +void +sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) +{ + dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); +} + +void +sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) +{ + dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, + blksize, nblocks); +} + +void +sa_set_userp(sa_handle_t *hdl, void *ptr) +{ + hdl->sa_userp = ptr; +} + +dmu_buf_t * +sa_get_db(sa_handle_t *hdl) +{ + return ((dmu_buf_t *)hdl->sa_bonus); +} + +void * +sa_get_userdata(sa_handle_t *hdl) +{ + return (hdl->sa_userp); +} + +void +sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) +{ + ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); + os->os_sa->sa_update_cb = func; +} + +void +sa_register_update_callback(objset_t *os, sa_update_cb_t *func) +{ + + mutex_enter(&os->os_sa->sa_lock); + sa_register_update_callback_locked(os, func); + mutex_exit(&os->os_sa->sa_lock); +} + +uint64_t +sa_handle_object(sa_handle_t *hdl) +{ + return (hdl->sa_bonus->db_object); +} + +boolean_t +sa_enabled(objset_t *os) +{ + return (os->os_sa == NULL); +} + +int +sa_set_sa_object(objset_t *os, uint64_t sa_object) +{ + sa_os_t *sa = os->os_sa; + + if (sa->sa_master_obj) + return (1); + + sa->sa_master_obj = sa_object; + + return (0); +} + +int +sa_hdrsize(void *arg) +{ + sa_hdr_phys_t *hdr = arg; + + return (SA_HDR_SIZE(hdr)); +} + +void +sa_handle_lock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); +} + +void +sa_handle_unlock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_exit(&hdl->sa_lock); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 sha256.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c 27 Feb 2010 22:31:01 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c 28 Jun 2017 00:06:33 -0000 @@ -22,19 +22,34 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. 
+ */ #include #include +#ifdef __FreeBSD__ +#ifdef _KERNEL +#include +#include +#else +#include +#include +#endif +#else #include +#endif +/*ARGSUSED*/ void -zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) +zio_checksum_SHA256(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) { - SHA2_CTX ctx; + SHA256_CTX ctx; zio_cksum_t tmp; - SHA2Init(SHA256, &ctx); - SHA2Update(&ctx, buf, size); - SHA2Final(&tmp, &ctx); + SHA256_Init(&ctx); + SHA256_Update(&ctx, buf, size); + SHA256_Final((unsigned char *)&tmp, &ctx); /* * A prior implementation of this function had a @@ -48,3 +63,31 @@ zio_checksum_SHA256(const void *buf, uin zcp->zc_word[2] = BE_64(tmp.zc_word[2]); zcp->zc_word[3] = BE_64(tmp.zc_word[3]); } + +#ifndef __NetBSD__ +/*ARGSUSED*/ +void +zio_checksum_SHA512_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + SHA512_CTX ctx; + + SHA512_256_Init(&ctx); + SHA512_256_Update(&ctx, buf, size); + SHA512_256_Final((unsigned char *)zcp, &ctx); +} + +/*ARGSUSED*/ +void +zio_checksum_SHA512_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + zio_checksum_SHA512_native(buf, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} +#endif Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/skein_zfs.c 28 Jun 2017 00:19:57 -0000 @@ -0,0 +1,95 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ +#include +#include +#ifdef _KERNEL +#include +#else +#include +#endif + +/* + * Computes a native 256-bit skein MAC checksum. Please note that this + * function requires the presence of a ctx_template that should be allocated + * using zio_checksum_skein_tmpl_init. + */ +/*ARGSUSED*/ +void +zio_checksum_skein_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + Skein_512_Ctxt_t ctx; + + ASSERT(ctx_template != NULL); + bcopy(ctx_template, &ctx, sizeof (ctx)); + (void) Skein_512_Update(&ctx, buf, size); + (void) Skein_512_Final(&ctx, (uint8_t *)zcp); + bzero(&ctx, sizeof (ctx)); +} + +/* + * Byteswapped version of zio_checksum_skein_native. 
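/*
 * Illustrative sketch (not part of the patch): the byteswapped checksum
 * variants added here and in sha256.c all share one shape -- run the native
 * routine into a temporary, then swap each 64-bit word of the 256-bit result.
 * bswap64() below is a portable stand-in for BSWAP_64(), struct cksum for
 * zio_cksum_t, and the native digest is assumed to have been computed
 * already.
 */
#include <stdint.h>
#include <stdio.h>

struct cksum {
	uint64_t	word[4];
};

static uint64_t
bswap64(uint64_t x)
{
	x = ((x & 0x00ff00ff00ff00ffULL) << 8) | ((x >> 8) & 0x00ff00ff00ff00ffULL);
	x = ((x & 0x0000ffff0000ffffULL) << 16) | ((x >> 16) & 0x0000ffff0000ffffULL);
	return ((x << 32) | (x >> 32));
}

static void
cksum_byteswap(struct cksum *out, const struct cksum *native)
{
	for (int i = 0; i < 4; i++)
		out->word[i] = bswap64(native->word[i]);
}

int
main(void)
{
	struct cksum native = { { 0x0123456789abcdefULL, 1, 2, 3 } };
	struct cksum swapped;

	cksum_byteswap(&swapped, &native);
	printf("%016llx\n", (unsigned long long)swapped.word[0]);
	return (0);
}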
This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * skein is internally endian-insensitive). + */ +void +zio_checksum_skein_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + zio_checksum_skein_native(buf, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a skein MAC template suitable for using in skein MAC checksum + * computations and returns a pointer to it. + */ +void * +zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) +{ + Skein_512_Ctxt_t *ctx; + + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, + salt->zcs_bytes, sizeof (salt->zcs_bytes)); + return (ctx); +} + +/* + * Frees a skein context template previously allocated using + * zio_checksum_skein_tmpl_init. + */ +void +zio_checksum_skein_tmpl_free(void *ctx_template) +{ + Skein_512_Ctxt_t *ctx = ctx_template; + + bzero(ctx, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c,v retrieving revision 1.6 diff -u -p -r1.6 spa.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c 27 Mar 2014 15:50:48 -0000 1.6 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c 30 Apr 2017 14:02:00 -0000 @@ -1,4 +1,3 @@ - /* * CDDL HEADER START * @@ -21,11 +20,19 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Martin Matuska . All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome */ /* + * SPA: Storage Pool Allocator + * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. @@ -58,40 +65,60 @@ #include #include #include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #ifdef _KERNEL -#include #include #include -#include -#include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" +/* Check hostid on import? */ +static int check_hostid = 1; + +/* + * The interval, in seconds, at which failed configuration cache file writes + * should be retried. 
+ */ +static int zfs_ccw_retry_interval = 300; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, + "Check hostid on import?"); +TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); +SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, + &zfs_ccw_retry_interval, 0, + "Configuration cache file write, retry after failure, interval (seconds)"); + typedef enum zti_modes { - zti_mode_fixed, /* value is # of threads (min 1) */ - zti_mode_online_percent, /* value is % of online CPUs */ - zti_mode_batch, /* cpu-intensive; value is ignored */ - zti_mode_null, /* don't create a taskq */ - zti_nmodes + ZTI_MODE_FIXED, /* value is # of threads (min 1) */ + ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ + ZTI_MODE_NULL, /* don't create a taskq */ + ZTI_NMODES } zti_modes_t; -#define ZTI_FIX(n) { zti_mode_fixed, (n) } -#define ZTI_PCT(n) { zti_mode_online_percent, (n) } -#define ZTI_BATCH { zti_mode_batch, 0 } -#define ZTI_NULL { zti_mode_null, 0 } +#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } +#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } +#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } -#define ZTI_ONE ZTI_FIX(1) +#define ZTI_N(n) ZTI_P(n, 1) +#define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { - enum zti_modes zti_mode; + zti_modes_t zti_mode; uint_t zti_value; + uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { @@ -99,31 +126,53 @@ static const char *const zio_taskq_types }; /* - * Define the taskq threads for the following I/O types: - * NULL, READ, WRITE, FREE, CLAIM, and IOCTL + * This table defines the taskq settings for each ZFS I/O type. When + * initializing a pool, we use this table to create an appropriately sized + * taskq. Some operations are low volume and therefore have a small, static + * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE + * macros. Other operations process a large amount of data; the ZTI_BATCH + * macro causes us to create a taskq oriented for throughput. Some operations + * are so high frequency and short-lived that the taskq itself can become a a + * point of lock contention. The ZTI_P(#, #) macro indicates that we need an + * additional degree of parallelism specified by the number of threads per- + * taskq and the number of taskqs; when dispatching an event in this case, the + * particular taskq is chosen at random. + * + * The different taskq priorities are to handle the different contexts (issue + * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that + * need to be handled with minimum delay. 
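/*
 * Illustrative sketch (not part of the patch): the ZTI_P(n, q) dispatch
 * described above gives one I/O type several taskqs and picks one per
 * dispatch from a cheap clock value, spreading lock contention instead of
 * funnelling every request through a single queue.  NQUEUES, pick_queue()
 * and the counters are invented for the example; clock_gettime() stands in
 * for gethrtime()/cpu_ticks().
 */
#include <stdio.h>
#include <time.h>

#define	NQUEUES	8

static unsigned dispatched[NQUEUES];

static unsigned
pick_queue(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((unsigned)ts.tv_nsec % NQUEUES);
}

int
main(void)
{
	for (int i = 0; i < 10000; i++)
		dispatched[pick_queue()]++;	/* model of taskq_dispatch_ent() */
	for (int q = 0; q < NQUEUES; q++)
		printf("queue %d: %u\n", q, dispatched[q]);
	return (0);
}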
*/ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, - { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, - { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ + { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ + { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ + { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; -static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); +static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name); +static void spa_event_post(sysevent_t *ev); +static void spa_sync_version(void *arg, dmu_tx_t *tx); +static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport); +static void spa_vdev_resilver_done(spa_t *spa); -uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ +uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ +#ifdef PSRSET_BIND id_t zio_taskq_psrset_bind = PS_NONE; +#endif +#ifdef SYSDC boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ +#endif boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ +extern int zfs_sync_pass_deferred_free; /* * This (illegal) pool name is used when temporarily importing a spa_t in order @@ -165,15 +214,16 @@ spa_prop_add_list(nvlist_t *nvl, zpool_p static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { - uint64_t size; - uint64_t alloc; - uint64_t cap, version; + vdev_t *rvd = spa->spa_root_vdev; + dsl_pool_t *pool = spa->spa_dsl_pool; + uint64_t size, alloc, cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; + metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - if (spa->spa_root_vdev != NULL) { + if (rvd != NULL) { alloc = metaslab_class_get_alloc(spa_normal_class(spa)); size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); @@ -182,6 +232,13 @@ spa_prop_get_config(spa_t *spa, nvlist_t spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, + metaslab_class_fragmentation(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, + metaslab_class_expandable_space(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + (spa_mode(spa) == FREAD), src); + cap = (size == 0) ? 
0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); @@ -189,7 +246,7 @@ spa_prop_get_config(spa_t *spa, nvlist_t ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - spa->spa_root_vdev->vdev_state, src); + rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) @@ -199,12 +256,49 @@ spa_prop_get_config(spa_t *spa, nvlist_t spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } + if (pool != NULL) { + /* + * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, + * when opening pools before this version freedir will be NULL. + */ + if (pool->dp_free_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, + src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + NULL, 0, src); + } + + if (pool->dp_leak_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, + dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, + src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, + NULL, 0, src); + } + } + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + if (spa->spa_comment != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, + 0, ZPROP_SRC_LOCAL); + } + if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); + } + if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, @@ -268,19 +362,18 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); - rw_enter(&dp->dp_config_rwlock, RW_READER); + dsl_pool_config_enter(dp, FTAG); if (err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds)) { - rw_exit(&dp->dp_config_rwlock); + dsl_pool_config_exit(dp, FTAG); break; } - strval = kmem_alloc( - MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, + strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); - rw_exit(&dp->dp_config_rwlock); + dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; @@ -289,8 +382,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) - kmem_free(strval, - MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); + kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); break; @@ -332,27 +424,57 @@ spa_prop_validate(spa_t *spa, nvlist_t * { nvpair_t *elem; int error = 0, reset_bootfs = 0; - uint64_t objnum; + uint64_t objnum = 0; + boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { - zpool_prop_t prop; - char *propname, *strval; uint64_t intval; - objset_t *os; - char *slash; + char *strval, *slash, *check, *fname; + const char *propname = nvpair_name(elem); + zpool_prop_t prop = zpool_name_to_prop(propname); + + switch (prop) { + case ZPROP_INVAL: + if (!zpool_prop_feature(propname)) { + error = SET_ERROR(EINVAL); + break; + } - propname = nvpair_name(elem); + /* + * Sanitize the input. 
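/*
 * Illustrative sketch (not part of the patch): the feature-property
 * validation that follows accepts only names of the form "feature@<fname>",
 * only a uint64 value of 0 (features can be enabled, not set to arbitrary
 * values), and only feature names the code knows about.  The small list and
 * feature_prop_ok() below are invented stand-ins for zfeature_lookup_name()
 * and the nvpair checks.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static const char *known_features[] = { "async_destroy", "empty_bpobj" };

static bool
feature_prop_ok(const char *propname, unsigned long long value)
{
	const char *fname;

	if (strncmp(propname, "feature@", 8) != 0)
		return (false);
	if (value != 0)
		return (false);		/* only "enabled" is accepted */
	fname = propname + 8;
	for (size_t i = 0;
	    i < sizeof (known_features) / sizeof (known_features[0]); i++) {
		if (strcmp(fname, known_features[i]) == 0)
			return (true);
	}
	return (false);
}

int
main(void)
{
	printf("%d\n", feature_prop_ok("feature@async_destroy", 0));	/* 1 */
	printf("%d\n", feature_prop_ok("feature@async_destroy", 5));	/* 0 */
	printf("%d\n", feature_prop_ok("feature@no_such_thing", 0));	/* 0 */
	return (0);
}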
+ */ + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = SET_ERROR(EINVAL); + break; + } - if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) - return (EINVAL); + if (nvpair_value_uint64(elem, &intval) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + if (intval != 0) { + error = SET_ERROR(EINVAL); + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + has_feature = B_TRUE; + break; - switch (prop) { case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && - (intval < spa_version(spa) || intval > SPA_VERSION)) - error = EINVAL; + (intval < spa_version(spa) || + intval > SPA_VERSION_BEFORE_FEATURES || + has_feature)) + error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: @@ -361,7 +483,7 @@ spa_prop_validate(spa_t *spa, nvlist_t * case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) - error = EINVAL; + error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_BOOTFS: @@ -371,7 +493,7 @@ spa_prop_validate(spa_t *spa, nvlist_t * * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { - error = ENOTSUP; + error = SET_ERROR(ENOTSUP); break; } @@ -379,7 +501,7 @@ spa_prop_validate(spa_t *spa, nvlist_t * * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { - error = ENOTSUP; + error = SET_ERROR(ENOTSUP); break; } @@ -388,7 +510,8 @@ spa_prop_validate(spa_t *spa, nvlist_t * error = nvpair_value_string(elem, &strval); if (!error) { - uint64_t compress; + objset_t *os; + uint64_t propval; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( @@ -399,15 +522,20 @@ spa_prop_validate(spa_t *spa, nvlist_t * if (error = dmu_objset_hold(strval, FTAG, &os)) break; - /* Must be ZPL and not gzip compressed. */ + /* + * Must be ZPL, and its property settings + * must be supported by GRUB (compression + * is not gzip, and large blocks are not used). + */ if (dmu_objset_type(os) != DMU_OST_ZFS) { - error = ENOTSUP; - } else if ((error = dsl_prop_get_integer(strval, + error = SET_ERROR(ENOTSUP); + } else if ((error = + dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), - &compress, NULL)) == 0 && - !BOOTFS_COMPRESS_VALID(compress)) { - error = ENOTSUP; + &propval)) == 0 && + !BOOTFS_COMPRESS_VALID(propval)) { + error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } @@ -419,7 +547,7 @@ spa_prop_validate(spa_t *spa, nvlist_t * error = nvpair_value_uint64(elem, &intval); if (!error && (intval < ZIO_FAILURE_MODE_WAIT || intval > ZIO_FAILURE_MODE_PANIC)) - error = EINVAL; + error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when @@ -433,7 +561,7 @@ spa_prop_validate(spa_t *spa, nvlist_t * */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; - error = EIO; + error = SET_ERROR(EIO); } break; @@ -448,7 +576,7 @@ spa_prop_validate(spa_t *spa, nvlist_t * break; if (strval[0] != '/') { - error = EINVAL; + error = SET_ERROR(EINVAL); break; } @@ -457,17 +585,36 @@ spa_prop_validate(spa_t *spa, nvlist_t * if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) - error = EINVAL; + error = SET_ERROR(EINVAL); + break; + + case ZPOOL_PROP_COMMENT: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + for (check = strval; *check != '\0'; check++) { + /* + * The kernel doesn't have an easy isprint() + * check. 
For this kernel check, we merely + * check ASCII apart from DEL. Fix this if + * there is an easy-to-use kernel isprint(). + */ + if (*check >= 0x7f) { + error = SET_ERROR(EINVAL); + break; + } + } + if (strlen(strval) > ZPROP_MAX_COMMENT) + error = E2BIG; break; case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) - error = ENOTSUP; + error = SET_ERROR(ENOTSUP); else error = nvpair_value_uint64(elem, &intval); if (error == 0 && intval != 0 && intval < ZIO_DEDUPDITTO_MIN) - error = EINVAL; + error = SET_ERROR(EINVAL); break; } @@ -517,31 +664,59 @@ int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; - nvpair_t *elem; + nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; - zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { - if ((prop = zpool_name_to_prop( - nvpair_name(elem))) == ZPROP_INVAL) - return (EINVAL); + zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); + + if (prop == ZPOOL_PROP_CACHEFILE || + prop == ZPOOL_PROP_ALTROOT || + prop == ZPOOL_PROP_READONLY) + continue; + + if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { + uint64_t ver; - if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + if (prop == ZPOOL_PROP_VERSION) { + VERIFY(nvpair_value_uint64(elem, &ver) == 0); + } else { + ASSERT(zpool_prop_feature(nvpair_name(elem))); + ver = SPA_VERSION_FEATURES; + need_sync = B_TRUE; + } + + /* Save time if the version is already set. */ + if (ver == spa_version(spa)) + continue; + + /* + * In addition to the pool directory object, we might + * create the pool properties object, the features for + * read object, the features for write object, or the + * feature descriptions object. + */ + error = dsl_sync_task(spa->spa_name, NULL, + spa_sync_version, &ver, + 6, ZFS_SPACE_CHECK_RESERVED); + if (error) + return (error); continue; + } need_sync = B_TRUE; break; } - if (need_sync) - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); - else - return (0); + if (need_sync) { + return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, + nvp, 6, ZFS_SPACE_CHECK_RESERVED)); + } + + return (0); } /* @@ -558,6 +733,80 @@ spa_prop_clear_bootfs(spa_t *spa, uint64 } } +/*ARGSUSED*/ +static int +spa_change_guid_check(void *arg, dmu_tx_t *tx) +{ + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t vdev_state; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_state = rvd->vdev_state; + spa_config_exit(spa, SCL_STATE, FTAG); + + if (vdev_state != VDEV_STATE_HEALTHY) + return (SET_ERROR(ENXIO)); + + ASSERT3U(spa_guid(spa), !=, *newguid); + + return (0); +} + +static void +spa_change_guid_sync(void *arg, dmu_tx_t *tx) +{ + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + uint64_t oldguid; + vdev_t *rvd = spa->spa_root_vdev; + + oldguid = spa_guid(spa); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + rvd->vdev_guid = *newguid; + rvd->vdev_guid_sum += (*newguid - oldguid); + vdev_config_dirty(rvd); + spa_config_exit(spa, SCL_STATE, FTAG); + + spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", + oldguid, *newguid); +} + +/* + * Change the GUID for the pool. This is done so that we can later + * re-import a pool built from a clone of our own vdevs. We will modify + * the root vdev's guid, our own pool guid, and then mark all of our + * vdevs dirty. 
Note that we must make sure that all our vdevs are + * online when we do this, or else any vdevs that weren't present + * would be orphaned from our pool. We are also going to issue a + * sysevent to update any watchers. + */ +int +spa_change_guid(spa_t *spa) +{ + int error; + uint64_t guid; + + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + guid = spa_generate_guid(NULL); + + error = dsl_sync_task(spa->spa_name, spa_change_guid_check, + spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); + + if (error == 0) { + spa_config_sync(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); + } + + mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); +} + /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) @@ -572,7 +821,7 @@ spa_error_entry_compare(const void *a, c int ret; ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, - sizeof (zbookmark_t)); + sizeof (zbookmark_phys_t)); if (ret < 0) return (-1); @@ -602,48 +851,141 @@ spa_get_errlists(spa_t *spa, avl_tree_t offsetof(spa_error_entry_t, se_avl)); } -static taskq_t * -spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, - uint_t value) +static void +spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { - uint_t flags = TASKQ_PREPOPULATE; + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + uint_t count = ztip->zti_count; + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + char name[32]; + uint_t flags = 0; boolean_t batch = B_FALSE; - switch (mode) { - case zti_mode_null: - return (NULL); /* no taskq needed */ + if (mode == ZTI_MODE_NULL) { + tqs->stqs_count = 0; + tqs->stqs_taskq = NULL; + return; + } - case zti_mode_fixed: + ASSERT3U(count, >, 0); + + tqs->stqs_count = count; + tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); + + switch (mode) { + case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); break; - case zti_mode_batch: + case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = zio_taskq_batch_pct; break; - case zti_mode_online_percent: - flags |= TASKQ_THREADS_CPU_PCT; - break; - default: - panic("unrecognized mode for %s taskq (%u:%u) in " + panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", - name, mode, value); + zio_type_name[t], zio_taskq_types[q], mode, value); break; } - if (zio_taskq_sysdc && spa->spa_proc != &p0) { - if (batch) - flags |= TASKQ_DC_BATCH; + for (uint_t i = 0; i < count; i++) { + taskq_t *tq; + + if (count > 1) { + (void) snprintf(name, sizeof (name), "%s_%s_%u", + zio_type_name[t], zio_taskq_types[q], i); + } else { + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); + } + +#ifdef SYSDC + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + tq = taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags); + } else { +#endif + pri_t pri = maxclsyspri; + /* + * The write issue taskq can be extremely CPU + * intensive. Run it at slightly lower priority + * than the other taskqs. + * FreeBSD notes: + * - numerically higher priorities are lower priorities; + * - if priorities divided by four (RQ_PPQ) are equal + * then a difference between them is insignificant. 
+ */ + if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) +#ifdef illumos + pri--; +#else + pri += 4; +#endif + + tq = taskq_create_proc(name, value, pri, 50, + INT_MAX, spa->spa_proc, flags); +#ifdef SYSDC + } +#endif + + tqs->stqs_taskq[i] = tq; + } +} + +static void +spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + + if (tqs->stqs_taskq == NULL) { + ASSERT0(tqs->stqs_count); + return; + } + + for (uint_t i = 0; i < tqs->stqs_count; i++) { + ASSERT3P(tqs->stqs_taskq[i], !=, NULL); + taskq_destroy(tqs->stqs_taskq[i]); + } + + kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); + tqs->stqs_taskq = NULL; +} + +/* + * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. + * Note that a type may have multiple discrete taskqs to avoid lock contention + * on the taskq itself. In that case we choose which taskq at random by using + * the low bits of gethrtime(). + */ +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + taskq_t *tq; + + ASSERT3P(tqs->stqs_taskq, !=, NULL); + ASSERT3U(tqs->stqs_count, !=, 0); - return (taskq_create_sysdc(name, value, 50, INT_MAX, - spa->spa_proc, zio_taskq_basedc, flags)); + if (tqs->stqs_count == 1) { + tq = tqs->stqs_taskq[0]; + } else { +#if defined(__FreeBSD__) && defined(_KERNEL) + tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count]; +#else + tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; +#endif } - return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, - spa->spa_proc, flags)); + + taskq_dispatch_ent(tq, func, arg, flags, ent); } static void @@ -651,39 +993,30 @@ spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; - enum zti_modes mode = ztip->zti_mode; - uint_t value = ztip->zti_value; - char name[32]; - - (void) snprintf(name, sizeof (name), - "%s_%s", zio_type_name[t], zio_taskq_types[q]); - - spa->spa_zio_taskq[t][q] = - spa_taskq_create(spa, name, mode, value); + spa_taskqs_init(spa, t, q); } } } #ifdef _KERNEL +#ifdef SPA_PROCESS static void spa_thread(void *arg) { callb_cpr_t cprinfo; spa_t *spa = arg; + user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); -#ifdef PORT_SOLARIS - user_t *pu = PTOU(curproc); - (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); +#ifdef PSRSET_BIND /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); @@ -705,12 +1038,16 @@ spa_thread(void *arg) mutex_exit(&cpu_lock); pool_unlock(); } +#endif +#ifdef SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } -#endif /* PORT_SOLARIS */ +#endif + spa->spa_proc = curproc; + spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); @@ -731,9 +1068,10 @@ spa_thread(void *arg) cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ -/* mutex_enter(curproc->p_lock); - lwp_exit(curproc); */ + mutex_enter(&curproc->p_lock); + lwp_exit(); } +#endif /* SPA_PROCESS */ #endif /* @@ -755,7 +1093,8 @@ spa_activate(spa_t *spa, int mode) ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; -#if 0 + +#ifdef SPA_PROCESS /* 
Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, @@ -775,17 +1114,25 @@ spa_activate(spa_t *spa, int mode) spa->spa_name); #endif } - } -#endif + } +#endif /* SPA_PROCESS */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ + ASSERT(spa->spa_proc == &p0); if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } + /* + * Start TRIM thread. + */ + trim_thread_create(spa); + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_evicting_os_list, sizeof (objset_t), + offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); @@ -812,16 +1159,23 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + /* + * Stop TRIM thread in case spa_unload() wasn't called directly + * before spa_deactivate(). + */ + trim_thread_destroy(spa); + + spa_evicting_os_wait(spa); + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - if (spa->spa_zio_taskq[t][q] != NULL) - taskq_destroy(spa->spa_zio_taskq[t][q]); - spa->spa_zio_taskq[t][q] = NULL; + spa_taskqs_fini(spa, t, q); } } @@ -857,6 +1211,7 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); +#ifdef SPA_PROCESS /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath @@ -866,6 +1221,7 @@ spa_deactivate(spa_t *spa) thread_join(spa->spa_did); spa->spa_did = 0; } +#endif /* SPA_PROCESS */ } /* @@ -897,7 +1253,7 @@ spa_config_parse(spa_t *spa, vdev_t **vd if (error) { vdev_free(*vdp); *vdp = NULL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { @@ -926,6 +1282,11 @@ spa_unload(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* + * Stop TRIM thread. + */ + trim_thread_destroy(spa); + + /* * Stop async tasks. */ spa_async_suspend(spa); @@ -939,13 +1300,39 @@ spa_unload(spa_t *spa) } /* + * Even though vdev_free() also calls vdev_metaslab_fini, we need + * to call it earlier, before we wait for async i/o to complete. + * This ensures that there is no async metaslab prefetching, by + * calling taskq_wait(mg_taskq). + */ + if (spa->spa_root_vdev != NULL) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) + vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); + spa_config_exit(spa, SCL_ALL, FTAG); + } + + /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { - (void) zio_wait(spa->spa_async_zio_root); + for (int i = 0; i < max_ncpus; i++) + (void) zio_wait(spa->spa_async_zio_root[i]); + kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } + bpobj_close(&spa->spa_deferred_bpobj); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Close all vdevs. + */ + if (spa->spa_root_vdev) + vdev_free(spa->spa_root_vdev); + ASSERT(spa->spa_root_vdev == NULL); + /* * Close the dsl pool. 
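
[Reviewer aside] Earlier in this hunk, the new spa_taskq_dispatch_ent() spreads work for one I/O type across several taskqs and picks one using the low bits of gethrtime() (cpu_ticks() on FreeBSD) rather than a shared counter. A minimal user-space sketch of that selection idiom, with a hypothetical fake_queue_t and clock_gettime() standing in for the kernel taskq API:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NQUEUES	4	/* hypothetical: stands in for tqs->stqs_count */

/* Stand-in for taskq_t; the real code dispatches into one of several taskqs. */
typedef struct fake_queue {
	int	fq_id;
	long	fq_dispatched;
} fake_queue_t;

/* Cheap "random" pick: use the low bits of a high-resolution timestamp. */
static fake_queue_t *
pick_queue(fake_queue_t *queues, unsigned nqueues)
{
	struct timespec ts;

	if (nqueues == 1)
		return (&queues[0]);
	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return (&queues[(uint64_t)ts.tv_nsec % nqueues]);
}

int
main(void)
{
	fake_queue_t queues[NQUEUES] = { { 0 }, { 1 }, { 2 }, { 3 } };

	for (int i = 0; i < 100000; i++)
		pick_queue(queues, NQUEUES)->fq_dispatched++;
	for (int i = 0; i < NQUEUES; i++)
		printf("queue %d: %ld dispatches\n",
		    queues[i].fq_id, queues[i].fq_dispatched);
	return (0);
}

The timer-bit trick keeps the hot dispatch path free of any shared atomic counter; the distribution only needs to be roughly even, not cryptographically random.
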
*/ @@ -957,20 +1344,11 @@ spa_unload(spa_t *spa) ddt_unload(spa); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); - /* - * Close all vdevs. - */ - if (spa->spa_root_vdev) - vdev_free(spa->spa_root_vdev); - ASSERT(spa->spa_root_vdev == NULL); - for (i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { @@ -984,8 +1362,10 @@ spa_unload(spa_t *spa) } spa->spa_spares.sav_count = 0; - for (i = 0; i < spa->spa_l2cache.sav_count; i++) + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); @@ -999,6 +1379,11 @@ spa_unload(spa_t *spa) spa->spa_async_suspended = 0; + if (spa->spa_comment != NULL) { + spa_strfree(spa->spa_comment); + spa->spa_comment = NULL; + } + spa_config_exit(spa, SCL_ALL, FTAG); } @@ -1109,7 +1494,7 @@ spa_load_spares(spa_t *spa) KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, - spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) @@ -1143,6 +1528,7 @@ spa_load_l2cache(spa_t *spa) newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); } else { nl2cache = 0; + newvdevs = NULL; } oldvdevs = sav->sav_vdevs; @@ -1208,11 +1594,13 @@ spa_load_l2cache(spa_t *spa) vd = oldvdevs[i]; if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); + if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - (void) vdev_close(vd); - spa_l2cache_remove(vd); + vdev_clear_stats(vd); + vdev_free(vd); } } @@ -1235,7 +1623,7 @@ spa_load_l2cache(spa_t *spa) l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, - sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: @@ -1254,7 +1642,10 @@ load_nvlist(spa_t *spa, uint64_t obj, nv int error; *value = NULL; - VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); + if (error != 0) + return (error); + nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); @@ -1278,70 +1669,193 @@ spa_check_removed(vdev_t *vd) for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); - if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && + !vd->vdev_ishole) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); } } -/* - * Load the slog device state from the config object since it's possible - * that the label does not contain the most up-to-date information. - */ -void -spa_load_log_state(spa_t *spa, nvlist_t *nv) +static void +spa_config_valid_zaps(vdev_t *vd, vdev_t *mvd) { - vdev_t *ovd, *rvd = spa->spa_root_vdev; + ASSERT3U(vd->vdev_children, ==, mvd->vdev_children); - /* - * Load the original root vdev tree from the passed config. 
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + vd->vdev_top_zap = mvd->vdev_top_zap; + vd->vdev_leaf_zap = mvd->vdev_leaf_zap; - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *cvd = rvd->vdev_child[c]; - if (cvd->vdev_islog) - vdev_load_log_state(cvd, ovd->vdev_child[c]); + for (uint64_t i = 0; i < vd->vdev_children; i++) { + spa_config_valid_zaps(vd->vdev_child[i], mvd->vdev_child[i]); } - vdev_free(ovd); - spa_config_exit(spa, SCL_ALL, FTAG); } /* - * Check for missing log devices + * Validate the current config against the MOS config */ -int -spa_check_logs(spa_t *spa) -{ - switch (spa->spa_log_state) { - case SPA_LOG_MISSING: - /* need to recheck in case slog has been restored */ - case SPA_LOG_UNKNOWN: - if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, - DS_FIND_CHILDREN)) { - spa_set_log_state(spa, SPA_LOG_MISSING); - return (1); - } - break; - } - return (0); -} - static boolean_t -spa_passivate_log(spa_t *spa) +spa_config_valid(spa_t *spa, nvlist_t *config) { - vdev_t *rvd = spa->spa_root_vdev; - boolean_t slog_found = B_FALSE; + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv; - ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); - if (!spa_has_slogs(spa)) - return (B_FALSE); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - metaslab_group_t *mg = tvd->vdev_mg; + ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); + + /* + * If we're doing a normal import, then build up any additional + * diagnostic information about missing devices in this config. + * We'll pass this up to the user for further processing. + */ + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { + nvlist_t **child, *nv; + uint64_t idx = 0; + + child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), + KM_SLEEP); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops && + mtvd->vdev_islog) + child[idx++] = vdev_config_generate(spa, mtvd, + B_FALSE, 0); + } + + if (idx) { + VERIFY(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx) == 0); + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); + + for (int i = 0; i < idx; i++) + nvlist_free(child[i]); + } + nvlist_free(nv); + kmem_free(child, rvd->vdev_children * sizeof (char **)); + } + + /* + * Compare the root vdev tree with the information we have + * from the MOS config (mrvd). Check each top-level vdev + * with the corresponding MOS config top-level (mtvd). + */ + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + vdev_t *mtvd = mrvd->vdev_child[c]; + + /* + * Resolve any "missing" vdevs in the current configuration. + * If we find that the MOS config has more accurate information + * about the top-level vdev then use that vdev instead. + */ + if (tvd->vdev_ops == &vdev_missing_ops && + mtvd->vdev_ops != &vdev_missing_ops) { + + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) + continue; + + /* + * Device specific actions. 
+ */ + if (mtvd->vdev_islog) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + /* + * XXX - once we have 'readonly' pool + * support we should be able to handle + * missing data devices by transitioning + * the pool to readonly. + */ + continue; + } + + /* + * Swap the missing vdev with the data we were + * able to obtain from the MOS config. + */ + vdev_remove_child(rvd, tvd); + vdev_remove_child(mrvd, mtvd); + + vdev_add_child(rvd, mtvd); + vdev_add_child(mrvd, tvd); + + spa_config_exit(spa, SCL_ALL, FTAG); + vdev_load(mtvd); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + vdev_reopen(rvd); + } else { + if (mtvd->vdev_islog) { + /* + * Load the slog device's state from the MOS + * config since it's possible that the label + * does not contain the most up-to-date + * information. + */ + vdev_load_log_state(tvd, mtvd); + vdev_reopen(tvd); + } + + /* + * Per-vdev ZAP info is stored exclusively in the MOS. + */ + spa_config_valid_zaps(tvd, mtvd); + } + } + + vdev_free(mrvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Ensure we were able to validate the config. + */ + return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); +} + +/* + * Check for missing log devices + */ +static boolean_t +spa_check_logs(spa_t *spa) +{ + boolean_t rv = B_FALSE; + dsl_pool_t *dp = spa_get_dsl(spa); + + switch (spa->spa_log_state) { + case SPA_LOG_MISSING: + /* need to recheck in case slog has been restored */ + case SPA_LOG_UNKNOWN: + rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); + if (rv) + spa_set_log_state(spa, SPA_LOG_MISSING); + break; + } + return (rv); +} + +static boolean_t +spa_passivate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; + + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + if (!spa_has_slogs(spa)) + return (B_FALSE); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); @@ -1371,11 +1885,11 @@ spa_activate_log(spa_t *spa) int spa_offline_log(spa_t *spa) { - int error = 0; - - if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN)) == 0) { + int error; + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed @@ -1389,7 +1903,9 @@ spa_offline_log(spa_t *spa) static void spa_aux_check_removed(spa_aux_vdev_t *sav) { - for (int i = 0; i < sav->sav_count; i++) + int i; + + for (i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } @@ -1419,32 +1935,85 @@ spa_load_verify_done(zio_t *zio) spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; + spa_t *spa = zio->io_spa; if (error) { - if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) - atomic_add_64(&sle->sle_meta_count, 1); + atomic_inc_64(&sle->sle_meta_count); else - atomic_add_64(&sle->sle_data_count, 1); + atomic_inc_64(&sle->sle_data_count); } zio_data_buf_free(zio->io_data, zio->io_size); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_inflight--; + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); } +/* + * Maximum number of concurrent scrub i/os to create while verifying + * a pool while 
importing it. + */ +int spa_load_verify_maxinflight = 10000; +boolean_t spa_load_verify_metadata = B_TRUE; +boolean_t spa_load_verify_data = B_TRUE; + +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, + &spa_load_verify_maxinflight, 0, + "Maximum number of concurrent scrub I/Os to create while verifying a " + "pool while importing it"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, + &spa_load_verify_metadata, 0, + "Check metadata on import?"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, + &spa_load_verify_data, 0, + "Check user data on import?"); + /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, - const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { - if (bp != NULL) { - zio_t *rio = arg; - size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); - - zio_nowait(zio_read(rio, spa, bp, data, size, - spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); - } + if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return (0); + /* + * Note: normally this routine will not be called if + * spa_load_verify_metadata is not set. However, it may be useful + * to manually set the flag after the traversal has begun. + */ + if (!spa_load_verify_metadata) + return (0); + if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) + return (0); + + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + void *data = zio_data_buf_alloc(size); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_scrub_inflight++; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(rio, spa, bp, data, size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + return (0); +} + +/* ARGSUSED */ +int +verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + return (0); } @@ -1455,18 +2024,29 @@ spa_load_verify(spa_t *spa) spa_load_error_t sle = { 0 }; zpool_rewind_policy_t policy; boolean_t verify_ok = B_FALSE; - int error; + int error = 0; zpool_get_rewind_policy(spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_NEVER_REWIND) return (0); + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + error = dmu_objset_find_dp(spa->spa_dsl_pool, + spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, + DS_FIND_CHILDREN); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + if (error != 0) + return (error); + rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); - error = traverse_pool(spa, spa->spa_verify_min_txg, - TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); + if (spa_load_verify_metadata) { + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + spa_load_verify_cb, rio); + } (void) zio_wait(rio); @@ -1475,16 +2055,26 @@ spa_load_verify(spa_t *spa) if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { + int64_t loss = 0; + verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + + loss = 
spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); + VERIFY(nvlist_add_int64(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (error) { if (error != ENXIO && error != EIO) - error = EIO; + error = SET_ERROR(EIO); return (error); } @@ -1606,12 +2196,17 @@ spa_load(spa_t *spa, spa_load_state_t st { nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; + char *comment; int error; uint64_t pool_guid; nvlist_t *nvl; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) - return (EINVAL); + return (SET_ERROR(EINVAL)); + + ASSERT(spa->spa_comment == NULL); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + spa->spa_comment = spa_strdup(comment); /* * Versioning wasn't explicitly added to the label until later, so if @@ -1626,9 +2221,9 @@ spa_load(spa_t *spa, spa_load_state_t st if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { - error = EEXIST; + error = SET_ERROR(EEXIST); } else { - spa->spa_load_guid = pool_guid; + spa->spa_config_guid = pool_guid; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) { @@ -1636,13 +2231,29 @@ spa_load(spa_t *spa, spa_load_state_t st KM_SLEEP) == 0); } + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + + gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); } + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + if (error) { + if (error != EEXIST) { + spa->spa_loaded_ts.tv_sec = 0; + spa->spa_loaded_ts.tv_nsec = 0; + } + if (error != EBADF) { + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + } + } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; @@ -1650,6 +2261,34 @@ spa_load(spa_t *spa, spa_load_state_t st } /* + * Count the number of per-vdev ZAPs associated with all of the vdevs in the + * vdev tree rooted in the given vd, and ensure that each ZAP is present in the + * spa's per-vdev ZAP list. + */ +static uint64_t +vdev_count_verify_zaps(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + uint64_t total = 0; + if (vd->vdev_top_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_top_zap)); + } + if (vd->vdev_leaf_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + total += vdev_count_verify_zaps(vd->vdev_child[i]); + } + + return (total); +} + +/* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. 
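
[Reviewer aside] The spa_load_verify_cb()/spa_load_verify_done() pair above caps the number of in-flight verification reads: the issue path blocks on spa_scrub_io_cv while spa_scrub_inflight is at spa_load_verify_maxinflight, and the completion path decrements and broadcasts. A small pthreads sketch of that throttle pattern (hypothetical worker, not the zio pipeline); build with cc -pthread:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical throttle mirroring spa_scrub_inflight / maxinflight. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cv   = PTHREAD_COND_INITIALIZER;
static int inflight;
static const int maxinflight = 8;

static void
issue_begin(void)
{
	pthread_mutex_lock(&lock);
	while (inflight >= maxinflight)	/* block the issuer, as the _cb does */
		pthread_cond_wait(&cv, &lock);
	inflight++;
	pthread_mutex_unlock(&lock);
}

static void
issue_done(void)
{
	pthread_mutex_lock(&lock);
	inflight--;
	pthread_cond_broadcast(&cv);	/* wake the issuer, as _done does */
	pthread_mutex_unlock(&lock);
}

static void *
worker(void *arg)
{
	(void) arg;
	usleep(1000);			/* pretend to do one read */
	issue_done();
	return (NULL);
}

int
main(void)
{
	pthread_t t[32];

	for (int i = 0; i < 32; i++) {
		issue_begin();
		pthread_create(&t[i], NULL, worker, NULL);
	}
	for (int i = 0; i < 32; i++)
		pthread_join(t[i], NULL);
	printf("inflight at exit: %d\n", inflight);
	return (0);
}

Blocking the issuer (rather than the completion side) is what bounds memory: at most maxinflight data buffers are ever allocated at once during import verification.
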
*/ @@ -1659,12 +2298,15 @@ spa_load_impl(spa_t *spa, uint64_t pool_ char **ereport) { int error = 0; - nvlist_t *nvconfig, *nvroot = NULL; + nvlist_t *nvroot = NULL; + nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; + uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; + uint64_t obj; + boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. @@ -1678,7 +2320,7 @@ spa_load_impl(spa_t *spa, uint64_t pool_ spa->spa_load_state = state; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) - return (EINVAL); + return (SET_ERROR(EINVAL)); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); @@ -1686,8 +2328,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_ /* * Create "The Godfather" zio to hold all async IOs */ - spa->spa_async_zio_root = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), + KM_SLEEP); + for (int i = 0; i < max_ncpus; i++) { + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } /* * Parse the configuration into a vdev tree. We explicitly set the @@ -1702,6 +2349,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_ return (error); ASSERT(spa->spa_root_vdev == rvd); + ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); @@ -1731,38 +2380,101 @@ spa_load_impl(spa_t *spa, uint64_t pool_ */ if (type != SPA_IMPORT_ASSEMBLE) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); + error = vdev_validate(rvd, mosconfig); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (ENXIO); + return (SET_ERROR(ENXIO)); } /* * Find the best uberblock. */ - vdev_uberblock_load(NULL, rvd, ub); + vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ - if (ub->ub_txg == 0) + if (ub->ub_txg == 0) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + } /* - * If the pool is newer than the code, we can't open it. + * If the pool has an unsupported version we can't open it. */ - if (ub->ub_version > SPA_VERSION) + if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { + nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + } + + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *features; + + /* + * If we weren't able to find what's necessary for reading the + * MOS in the label, return failure. + */ + if (label == NULL || nvlist_lookup_nvlist(label, + ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { + nvlist_free(label); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + /* + * Update our in-core representation with the definitive values + * from the label. + */ + nvlist_free(spa->spa_label_features); + VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + } + + nvlist_free(label); + + /* + * Look through entries in the label nvlist's features_for_read. If + * there is a feature listed there which we don't understand then we + * cannot open a pool. 
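
[Reviewer aside] The comment above describes the feature gate added in this hunk: every name in the label's features_for_read list must be understood, otherwise the open is refused and the unsupported names are reported. A stand-alone sketch of the same idea, with plain string arrays standing in for the label nvlist and for zfeature_is_supported():

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the label's features_for_read entries. */
static const char *label_features[] = {
	"com.delphix:hole_birth",
	"org.example:fancy_future_feature",	/* unknown on purpose */
};

/* Stand-in for zfeature_is_supported(). */
static int
feature_is_supported(const char *name)
{
	static const char *supported[] = {
		"com.delphix:hole_birth",
		"com.delphix:embedded_data",
	};

	for (size_t i = 0; i < sizeof (supported) / sizeof (supported[0]); i++)
		if (strcmp(name, supported[i]) == 0)
			return (1);
	return (0);
}

int
main(void)
{
	int unsup = 0;

	for (size_t i = 0;
	    i < sizeof (label_features) / sizeof (label_features[0]); i++) {
		if (!feature_is_supported(label_features[i])) {
			printf("unsupported feature: %s\n", label_features[i]);
			unsup++;
		}
	}
	/* The real code refuses to open the pool if any name was collected. */
	return (unsup ? 1 : 0);
}
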
+ */ + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *unsup_feat; + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, + NULL); nvp != NULL; + nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { + if (!zfeature_is_supported(nvpair_name(nvp))) { + VERIFY(nvlist_add_string(unsup_feat, + nvpair_name(nvp), "") == 0); + } + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + nvlist_free(unsup_feat); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + nvlist_free(unsup_feat); + } /* * If the vdev guid sum doesn't match the uberblock, we have an - * incomplete configuration. + * incomplete configuration. We first check to see if the pool + * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). + * If it is, defer the vdev_guid_sum check till later so we + * can handle missing vdevs. */ - if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); @@ -1784,8 +2496,9 @@ spa_load_impl(spa_t *spa, uint64_t pool_ spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; - error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; @@ -1793,12 +2506,119 @@ spa_load_impl(spa_t *spa, uint64_t pool_ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + if (spa_version(spa) >= SPA_VERSION_FEATURES) { + boolean_t missing_feat_read = B_FALSE; + nvlist_t *unsup_feat, *enabled_feat; + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, + &spa->spa_feat_for_read_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, + &spa->spa_feat_for_write_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, + &spa->spa_feat_desc_obj) != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + enabled_feat = fnvlist_alloc(); + unsup_feat = fnvlist_alloc(); + + if (!spa_features_check(spa, B_FALSE, + unsup_feat, enabled_feat)) + missing_feat_read = B_TRUE; + + if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { + if (!spa_features_check(spa, B_TRUE, + unsup_feat, enabled_feat)) { + missing_feat_write = B_TRUE; + } + } + + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); + + if (!nvlist_empty(unsup_feat)) { + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); + } + + fnvlist_free(enabled_feat); + fnvlist_free(unsup_feat); + + if (!missing_feat_read) { + fnvlist_add_boolean(spa->spa_load_info, + ZPOOL_CONFIG_CAN_RDONLY); + } + + /* + * If the state is SPA_LOAD_TRYIMPORT, our objective is + * twofold: to determine whether the pool is available for + * import in read-write mode and (if it is 
not) whether the + * pool is available for import in read-only mode. If the pool + * is available for import in read-write mode, it is displayed + * as available in userland; if it is not available for import + * in read-only mode, it is displayed as unavailable in + * userland. If the pool is available for import in read-only + * mode but not read-write mode, it is displayed as unavailable + * in userland with a special note that the pool is actually + * available for open in read-only mode. + * + * As a result, if the state is SPA_LOAD_TRYIMPORT and we are + * missing a feature for write, we must first determine whether + * the pool can be opened read-only before returning to + * userland in order to know whether to display the + * abovementioned note. + */ + if (missing_feat_read || (missing_feat_write && + spa_writeable(spa))) { + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + /* + * Load refcounts for ZFS features from disk into an in-memory + * cache during SPA initialization. + */ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + uint64_t refcount; + + error = feature_get_refcount_from_disk(spa, + &spa_feature_table[i], &refcount); + if (error == 0) { + spa->spa_feat_refcount_cache[i] = refcount; + } else if (error == ENOTSUP) { + spa->spa_feat_refcount_cache[i] = + SPA_FEATURE_DISABLED; + } else { + return (spa_vdev_err(rvd, + VDEV_AUX_CORRUPT_DATA, EIO)); + } + } + } + + if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { + if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, + &spa->spa_feat_enabled_txg_obj) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + spa->spa_is_initializing = B_TRUE; + error = dsl_pool_open(spa->spa_dsl_pool); + spa->spa_is_initializing = B_FALSE; + if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { uint64_t hostid; - nvlist_t *policy = NULL; + nvlist_t *policy = NULL, *nvconfig; + + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { @@ -1817,15 +2637,16 @@ spa_load_impl(spa_t *spa, uint64_t pool_ */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); #endif /* _KERNEL */ - if (hostid != 0 && myhostid != 0 && + if (check_hostid && hostid != 0 && myhostid != 0 && hostid != myhostid) { + nvlist_free(nvconfig); cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " - "See: http://www.sun.com/msg/ZFS-8000-EY", + "See: http://illumos.org/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); - return (EBADF); + return (SET_ERROR(EBADF)); } } if (nvlist_lookup_nvlist(spa->spa_config, @@ -1841,8 +2662,23 @@ spa_load_impl(spa_t *spa, uint64_t pool_ return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } - if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, - &spa->spa_deferred_bplist_obj) != 0) + /* Grab the secret checksum salt from the MOS. 
*/ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes); + if (error == ENOENT) { + /* Generate a new salt for subsequent use */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + } else if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); + if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* @@ -1854,6 +2690,11 @@ spa_load_impl(spa_t *spa, uint64_t pool_ if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, + &spa->spa_creation_version); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + /* * Load the persistent error log. If we have an older pool, this will * not be present. @@ -1876,6 +2717,39 @@ spa_load_impl(spa_t *spa, uint64_t pool_ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* + * Load the per-vdev ZAP map. If we have an older pool, this will not + * be present; in this case, defer its creation to a later time to + * avoid dirtying the MOS this early / out of sync context. See + * spa_sync_config_object. + */ + + /* The sentinel is only available in the MOS config. */ + nvlist_t *mos_config; + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, + &spa->spa_all_vdev_zaps); + + if (error != ENOENT && error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } else if (error == 0 && !nvlist_exists(mos_config, + ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { + /* + * An older version of ZFS overwrote the sentinel value, so + * we have orphaned per-vdev ZAPs in the MOS. Defer their + * destruction to later; see spa_sync_config_object. + */ + spa->spa_avz_action = AVZ_ACTION_DESTROY; + /* + * We're assuming that no vdevs have had their ZAPs created + * before this. Better be sure of it. + */ + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } + nvlist_free(mos_config); + + /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. @@ -1973,13 +2847,6 @@ spa_load_impl(spa_t *spa, uint64_t pool_ spa_config_exit(spa, SCL_ALL, FTAG); /* - * Check the state of the root vdev. If it can't be opened, it - * indicates one or more toplevel vdevs are faulted. - */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) - return (ENXIO); - - /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); @@ -1988,33 +2855,66 @@ spa_load_impl(spa_t *spa, uint64_t pool_ spa_update_dspace(spa); - if (state != SPA_LOAD_TRYIMPORT) { - error = spa_load_verify(spa); - if (error) - return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, - error)); - } - /* - * Load the intent log state and check log integrity. If we're - * assembling a pool from a split, the log is not transferred over. + * Validate the config, using the MOS config to fill in any + * information which might be missing. If we fail to validate + * the config then declare the pool unfit for use. If we're + * assembling a pool from a split, the log is not transferred + * over. 
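
[Reviewer aside] Earlier in this hunk the checksum salt is read from the MOS and, when the entry does not exist yet (ENOENT on an older pool), a fresh random value is generated for subsequent use. A hedged sketch of that read-or-initialize pattern, with a hypothetical lookup_salt() in place of zap_lookup() and rand() in place of the kernel random source:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

#define SALT_LEN	32

/*
 * Hypothetical persistent lookup: returns 0 and fills buf if the value
 * exists, or ENOENT if it was never written.
 */
static int
lookup_salt(unsigned char *buf, size_t len)
{
	(void) buf;
	(void) len;
	return (ENOENT);	/* pretend the key is missing */
}

int
main(void)
{
	unsigned char salt[SALT_LEN];
	int error;

	error = lookup_salt(salt, sizeof (salt));
	if (error == ENOENT) {
		/* Key absent: generate a fresh value for later use. */
		for (size_t i = 0; i < sizeof (salt); i++)
			salt[i] = (unsigned char)(rand() & 0xff);
		error = 0;
	}
	if (error != 0) {
		fprintf(stderr, "cannot read salt: %s\n", strerror(error));
		return (1);
	}
	printf("salt[0..3] = %02x %02x %02x %02x\n",
	    salt[0], salt[1], salt[2], salt[3]);
	return (0);
}

Only the "missing" case is treated as recoverable; any other lookup error is still propagated, which is exactly how the patch distinguishes ENOENT from real corruption.
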
*/ if (type != SPA_IMPORT_ASSEMBLE) { - VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - spa_load_log_state(spa, nvroot); + nvlist_t *nvconfig; + + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (!spa_config_valid(spa, nvconfig)) { + nvlist_free(nvconfig); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, + ENXIO)); + } nvlist_free(nvconfig); - if (spa_check_logs(spa)) { + /* + * Now that we've validated the config, check the state of the + * root vdev. If it can't be opened, it indicates one or + * more toplevel vdevs are faulted. + */ + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (SET_ERROR(ENXIO)); + + if (spa_writeable(spa) && spa_check_logs(spa)) { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } + if (missing_feat_write) { + ASSERT(state == SPA_LOAD_TRYIMPORT); + + /* + * At this point, we know that we can open the pool in + * read-only mode but not read-write mode. We now have enough + * information and can return to userland. + */ + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); + } + + /* + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. + */ + if (state != SPA_LOAD_TRYIMPORT) { + if (error = spa_load_verify(spa)) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; + dsl_pool_t *dp = spa_get_dsl(spa); ASSERT(state != SPA_LOAD_TRYIMPORT); @@ -2027,9 +2927,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_ */ spa->spa_claiming = B_TRUE; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), - spa_first_txg(spa)); - (void) dmu_objset_find(spa_name(spa), + tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); + (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); @@ -2052,12 +2951,13 @@ spa_load_impl(spa_t *spa, uint64_t pool_ * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * - * If spa_load_verbatim is true, trust the current + * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || - state == SPA_LOAD_RECOVER) + state == SPA_LOAD_IMPORT || + state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) @@ -2074,10 +2974,17 @@ spa_load_impl(spa_t *spa, uint64_t pool_ /* * Check all DTLs to see if anything needs resilvering. */ - if (vdev_resilver_needed(rvd, NULL, NULL)) + if (!dsl_scan_resilvering(spa->spa_dsl_pool) && + vdev_resilver_needed(rvd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* + * Log the fact that we booted up (so that we can detect if + * we rebooted in the middle of an operation). + */ + spa_history_log_version(spa, "open"); + + /* * Delete any inconsistent datasets. 
*/ (void) dmu_objset_find(spa_name(spa), @@ -2095,21 +3002,31 @@ spa_load_impl(spa_t *spa, uint64_t pool_ static int spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) { + int mode = spa->spa_mode; + spa_unload(spa); spa_deactivate(spa); - spa->spa_load_max_txg--; + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; - spa_activate(spa, spa_mode_global); + spa_activate(spa, mode); spa_async_suspend(spa); return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } +/* + * If spa_load() fails this function will try loading prior txg's. If + * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool + * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this + * function will not rewind the pool and will return the same error as + * spa_load(). + */ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { + nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; @@ -2120,6 +3037,8 @@ spa_load_best(spa_t *spa, spa_load_state spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; + if (max_request != UINT64_MAX) + spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, @@ -2138,11 +3057,20 @@ spa_load_best(spa_t *spa, spa_load_state return (load_error); } - /* Price of rolling back is discarding txgs, including log */ - if (state == SPA_LOAD_RECOVER) + if (state == SPA_LOAD_RECOVER) { + /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); - - spa->spa_load_max_txg = spa->spa_last_ubsync_txg; + } else { + /* + * If we aren't rolling back save the load info from our first + * import attempt so that we can restore it after attempting + * to rewind. + */ + loadinfo = spa->spa_load_info; + spa->spa_load_info = fnvlist_alloc(); + } + + spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; @@ -2158,16 +3086,26 @@ spa_load_best(spa_t *spa, spa_load_state rewind_error = spa_load_retry(spa, state, mosconfig); } - if (config) - spa_rewind_data_to_nvlist(spa, config); - spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); - return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); + if (state == SPA_LOAD_RECOVER) { + ASSERT3P(loadinfo, ==, NULL); + return (rewind_error); + } else { + /* Store the rewind info as part of the initial load info */ + fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, + spa->spa_load_info); + + /* Restore the initial load info */ + fnvlist_free(spa->spa_load_info); + spa->spa_load_info = loadinfo; + + return (load_error); + } } /* @@ -2187,10 +3125,10 @@ spa_open_common(const char *pool, spa_t nvlist_t **config) { spa_t *spa; - zpool_rewind_policy_t policy; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; + int firstopen = B_FALSE; *spapp = NULL; @@ -2208,27 +3146,20 @@ spa_open_common(const char *pool, spa_t if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); - return (ENOENT); + return (SET_ERROR(ENOENT)); } - zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, &policy); - if (policy.zrp_request & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - if (spa->spa_state == POOL_STATE_UNINITIALIZED) { + zpool_rewind_policy_t policy; - spa_activate(spa, spa_mode_global); + firstopen = B_TRUE; - if (spa->spa_last_open_failed && (policy.zrp_request & - (ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND))) { - if (config != NULL && spa->spa_config) - VERIFY(nvlist_dup(spa->spa_config, - config, KM_SLEEP) == 0); - spa_deactivate(spa); - if (locked) - mutex_exit(&spa_namespace_lock); - return (spa->spa_last_open_failed); - } + zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, + &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + + spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; @@ -2250,7 +3181,7 @@ spa_open_common(const char *pool, spa_t spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); - return (ENOENT); + return (SET_ERROR(ENOENT)); } if (error) { @@ -2259,9 +3190,13 @@ spa_open_common(const char *pool, spa_t * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_config) + if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist(*config, + ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; @@ -2270,20 +3205,33 @@ spa_open_common(const char *pool, spa_t *spapp = NULL; return (error); } - } spa_open_ref(spa, tag); - if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + /* + * If we've recovered the pool, pass back any information we + * gathered while doing the load. + */ + if (state == SPA_LOAD_RECOVER) { + VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + } + if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); +#ifdef __FreeBSD__ +#ifdef _KERNEL + if (firstopen) + zvol_create_minors(spa->spa_name); +#endif +#endif } *spapp = spa; @@ -2372,7 +3320,7 @@ spa_add_spares(spa_t *spa, nvlist_t *con if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( - spares[i], ZPOOL_CONFIG_STATS, + spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; @@ -2429,14 +3377,62 @@ spa_add_l2cache(spa_t *spa, nvlist_t *co ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], - ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); vdev_get_stats(vd, vs); } } } +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + zap_cursor_t zc; + zap_attribute_t za; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + /* We may be unable to read features if pool is suspended. 
*/ + if (spa_suspended(spa)) + goto out; + + if (spa->spa_feat_for_read_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_read_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + if (spa->spa_feat_for_write_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_write_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + +out: + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features) == 0); + nvlist_free(features); +} + int -spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) +spa_get_stats(const char *name, nvlist_t **config, + char *altroot, size_t buflen) { int error; spa_t *spa; @@ -2453,6 +3449,13 @@ spa_get_stats(const char *name, nvlist_t spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { + uint64_t loadtimes[2]; + + loadtimes[0] = spa->spa_loaded_ts.tv_sec; + loadtimes[1] = spa->spa_loaded_ts.tv_nsec; + VERIFY(nvlist_add_uint64_array(*config, + ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); @@ -2464,6 +3467,7 @@ spa_get_stats(const char *name, nvlist_t spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); + spa_add_feature_stats(spa, *config); } } @@ -2519,14 +3523,14 @@ spa_validate_aux_devs(spa_t *spa, nvlist return (0); if (ndev == 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use @@ -2542,7 +3546,7 @@ spa_validate_aux_devs(spa_t *spa, nvlist if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); - error = EINVAL; + error = SET_ERROR(EINVAL); goto out; } @@ -2553,7 +3557,8 @@ spa_validate_aux_devs(spa_t *spa, nvlist #ifdef _KERNEL if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { - error = ENOTBLK; + error = SET_ERROR(ENOTBLK); + vdev_free(vd); goto out; } #endif @@ -2663,10 +3668,6 @@ spa_l2cache_drop(spa_t *spa) if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - if (vd->vdev_isl2cache) - spa_l2cache_remove(vd); - vdev_clear_stats(vd); - (void) vdev_close(vd); } } @@ -2675,7 +3676,7 @@ spa_l2cache_drop(spa_t *spa) */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - const char *history_str, nvlist_t *zplprops) + nvlist_t *zplprops) { spa_t *spa; char *altroot = NULL; @@ -2686,7 +3687,8 @@ spa_create(const char *pool, nvlist_t *n uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version; + uint64_t version, obj; + boolean_t has_features; /* * If this pool already exists, return failure. 
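
[Reviewer aside] Throughout these hunks, bare errno returns such as return (EINVAL) are rewritten as return (SET_ERROR(EINVAL)). The point of the wrapper is to funnel every error return through one traceable spot; the sketch below is only an assumed, simplified form that prints instead of firing a tracing probe, and is not the macro from the imported code:

#include <stdio.h>
#include <errno.h>

/* Simplified, hypothetical SET_ERROR(): announce the return site, then yield err. */
#define SET_ERROR(err) \
	(fprintf(stderr, "%s:%d: returning %d\n", __FILE__, __LINE__, (err)), \
	(err))

static int
open_thing(int ok)
{
	if (!ok)
		return (SET_ERROR(EINVAL));
	return (0);
}

int
main(void)
{
	printf("open_thing(1) = %d\n", open_thing(1));
	printf("open_thing(0) = %d\n", open_thing(0));
	return (0);
}
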
@@ -2694,7 +3696,7 @@ spa_create(const char *pool, nvlist_t *n mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); - return (EEXIST); + return (SET_ERROR(EEXIST)); } /* @@ -2712,21 +3714,35 @@ spa_create(const char *pool, nvlist_t *n return (error); } - if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), - &version) != 0) + has_features = B_FALSE; + for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); + elem != NULL; elem = nvlist_next_nvpair(props, elem)) { + if (zpool_prop_feature(nvpair_name(elem))) + has_features = B_TRUE; + } + + if (has_features || nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; - ASSERT(version <= SPA_VERSION); + } + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; + spa->spa_load_state = SPA_LOAD_CREATE; /* * Create "The Godfather" zio to hold all async IOs */ - spa->spa_async_zio_root = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), + KM_SLEEP); + for (int i = 0; i < max_ncpus; i++) { + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } /* * Create the root vdev. @@ -2739,13 +3755,14 @@ spa_create(const char *pool, nvlist_t *n ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) - error = EINVAL; + error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (int c = 0; c < rvd->vdev_children; c++) { + vdev_ashift_optimize(rvd->vdev_child[c]); vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } @@ -2791,8 +3808,10 @@ spa_create(const char *pool, nvlist_t *n spa->spa_l2cache.sav_sync = B_TRUE; } + spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). @@ -2816,6 +3835,15 @@ spa_create(const char *pool, nvlist_t *n cmn_err(CE_PANIC, "failed to add pool config"); } + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, + sizeof (uint64_t), 1, &version, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool version"); + } + /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; @@ -2827,20 +3855,20 @@ spa_create(const char *pool, nvlist_t *n } /* - * Create the deferred-free bplist object. Turn off compression + * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. 
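
[Reviewer aside] spa_create() above now scans the creation-time property list for feature properties (has_features) before deciding whether to force the pool version to SPA_VERSION. A small sketch of that scan, assuming the conventional feature@ property naming and using plain strings in place of nvpairs and zpool_prop_feature():

#include <stdio.h>
#include <string.h>

/* Hypothetical creation-time property names, in place of the props nvlist. */
static const char *prop_names[] = {
	"ashift",
	"feature@embedded_data",
	"comment",
};

/* Stand-in for zpool_prop_feature(): assume feature props use a "feature@" prefix. */
static int
prop_is_feature(const char *name)
{
	return (strncmp(name, "feature@", strlen("feature@")) == 0);
}

int
main(void)
{
	int has_features = 0;

	for (size_t i = 0;
	    i < sizeof (prop_names) / sizeof (prop_names[0]); i++) {
		if (prop_is_feature(prop_names[i]))
			has_features = 1;
	}
	/* The real code then pins the version so feature flags are usable. */
	printf("has_features = %d\n", has_features);
	return (0);
}
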
*/ - spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, - 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, - spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); - + obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, obj, + ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { - cmn_err(CE_PANIC, "failed to add bplist"); + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, + sizeof (uint64_t), 1, &obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bpobj"); } + VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, + spa->spa_meta_objset, obj)); /* * Create the pool's history object. @@ -2849,6 +3877,12 @@ spa_create(const char *pool, nvlist_t *n spa_history_create_obj(spa, tx); /* + * Generate some random noise for salted checksums to operate on. + */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + + /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); @@ -2858,7 +3892,7 @@ spa_create(const char *pool, nvlist_t *n if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_sync_props(spa, props, CRED(), tx); + spa_sync_props(props, tx); } dmu_tx_commit(tx); @@ -2873,19 +3907,26 @@ spa_create(const char *pool, nvlist_t *n txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_sync(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); - if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) - (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); - spa_history_log_version(spa, LOG_POOL_CREATE); + spa_history_log_version(spa, "create"); + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); + spa->spa_load_state = SPA_LOAD_NONE; mutex_exit(&spa_namespace_lock); return (0); } +#ifndef __NetBSD__ #ifdef _KERNEL +#ifdef illumos /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. @@ -3000,9 +4041,9 @@ spa_import_rootpool(char *devpath, char } #endif if (config == NULL) { - cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); - return (EIO); + return (SET_ERROR(EIO)); } VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, @@ -3020,7 +4061,7 @@ spa_import_rootpool(char *devpath, char spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; - spa->spa_load_verbatim = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. @@ -3045,7 +4086,7 @@ spa_import_rootpool(char *devpath, char if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", (u_longlong_t)guid); - error = ENOENT; + error = SET_ERROR(ENOENT); goto out; } @@ -3057,7 +4098,7 @@ spa_import_rootpool(char *devpath, char if (avd != bvd) { cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " "try booting from '%s'", avd->vdev_path); - error = EINVAL; + error = SET_ERROR(EINVAL); goto out; } @@ -3069,13 +4110,13 @@ spa_import_rootpool(char *devpath, char !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " "try booting from '%s'", - bvd->vdev_parent->vdev_child[1]->vdev_path); - error = EINVAL; + bvd->vdev_parent-> + vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); + error = SET_ERROR(EINVAL); goto out; } error = 0; - spa_history_log_version(spa, LOG_POOL_IMPORT); out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); @@ -3086,51 +4127,215 @@ out: return (error); } -#endif +#else /* !illumos */ + +extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, + uint64_t *count); + +static nvlist_t * +spa_generate_rootconf(const char *name) +{ + nvlist_t **configs, **tops; + nvlist_t *config; + nvlist_t *best_cfg, *nvtop, *nvroot; + uint64_t *holes; + uint64_t best_txg; + uint64_t nchildren; + uint64_t pgid; + uint64_t count; + uint64_t i; + uint_t nholes; + + if (vdev_geom_read_pool_label(name, &configs, &count) != 0) + return (NULL); + + ASSERT3U(count, !=, 0); + best_txg = 0; + for (i = 0; i < count; i++) { + uint64_t txg; + + VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, + &txg) == 0); + if (txg > best_txg) { + best_txg = txg; + best_cfg = configs[i]; + } + } + + nchildren = 1; + nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); + holes = NULL; + nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, + &holes, &nholes); + + tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); + for (i = 0; i < nchildren; i++) { + if (i >= count) + break; + if (configs[i] == NULL) + continue; + VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + nvlist_dup(nvtop, &tops[i], KM_SLEEP); + } + for (i = 0; holes != NULL && i < nholes; i++) { + if (i >= nchildren) + continue; + if (tops[holes[i]] != NULL) + continue; + nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, + holes[i]) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, + 0) == 0); + } + for (i = 0; i < nchildren; i++) { + if (tops[i] != NULL) + continue; + nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, + i) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, + 0) == 0); + } + + /* + * Create pool config based on the best vdev config. + */ + nvlist_dup(best_cfg, &config, KM_SLEEP); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + tops, nchildren) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + + /* + * Drop vdev config elements that should not be present at pool level. 
+ */ + nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); + nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); + + for (i = 0; i < count; i++) + nvlist_free(configs[i]); + kmem_free(configs, count * sizeof(void *)); + for (i = 0; i < nchildren; i++) + nvlist_free(tops[i]); + kmem_free(tops, nchildren * sizeof(void *)); + nvlist_free(nvroot); + return (config); +} -/* - * Take a pool and insert it into the namespace as if it had been loaded at - * boot. - */ int -spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import_rootpool(const char *name) { spa_t *spa; - char *altroot = NULL; + vdev_t *rvd, *bvd, *avd = NULL; + nvlist_t *config, *nvtop; + uint64_t txg; + char *pname; + int error; - mutex_enter(&spa_namespace_lock); - if (spa_lookup(pool) != NULL) { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } + /* + * Read the label from the boot device and generate a configuration. + */ + config = spa_generate_rootconf(name); - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, config, altroot); + mutex_enter(&spa_namespace_lock); + if (config != NULL) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && strcmp(name, pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) + == 0); - spa->spa_load_verbatim = B_TRUE; + if ((spa = spa_lookup(pname)) != NULL) { + /* + * Remove the existing root pool from the namespace so + * that we can replace it with the correct config + * we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); - if (props != NULL) - spa_configfile_set(spa, props, B_FALSE); + /* + * Set spa_ubsync.ub_version as it can be used in vdev_alloc() + * via spa_version(). + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + } else if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } else { + VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); + } + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; - spa_config_sync(spa, B_FALSE, B_TRUE); + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, LOG_POOL_IMPORT); + nvlist_free(config); return (0); } +#endif /* illumos */ +#endif /* _KERNEL */ +#endif /* !__NetBSD__ */ + /* * Import a non-root pool into the system. 
*/ int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_rewind_policy_t policy; + uint64_t mode = spa_mode_global; + uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; @@ -3142,26 +4347,47 @@ spa_import(const char *pool, nvlist_t *c mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); - return (EEXIST); + return (SET_ERROR(EEXIST)); } - zpool_get_rewind_policy(config, &policy); - if (policy.zrp_request & ZPOOL_DO_REWIND) - state = SPA_LOAD_RECOVER; - /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); + if (readonly) + mode = FREAD; spa = spa_add(pool, config, altroot); - spa_activate(spa, spa_mode_global); + spa->spa_import_flags = flags; + + /* + * Verbatim import - Take a pool and insert it into the namespace + * as if it had been loaded at boot. + */ + if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_config_sync(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); + + mutex_exit(&spa_namespace_lock); + return (0); + } + + spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when @@ -3169,14 +4395,16 @@ spa_import(const char *pool, nvlist_t *c */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* - * Propagate anything learned about failing or best txgs - * back to caller + * Propagate anything learned while loading the pool and pass it + * back to caller (i.e. rewind info, missing devices, etc). 
*/ - spa_rewind_data_to_nvlist(spa, config); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -3274,9 +4502,17 @@ spa_import(const char *pool, nvlist_t *c */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); + spa_history_log_version(spa, "import"); + + spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); + mutex_exit(&spa_namespace_lock); - spa_history_log_version(spa, LOG_POOL_IMPORT); +#ifdef __FreeBSD__ +#ifdef _KERNEL + zvol_create_minors(pool); +#endif +#endif return (0); } @@ -3320,6 +4556,8 @@ spa_tryimport(nvlist_t *tryconfig) state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we @@ -3389,12 +4627,12 @@ spa_export_common(char *pool, int new_st *oldconfig = NULL; if (!(spa_mode_global & FWRITE)) - return (EROFS); + return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); - return (ENOENT); + return (SET_ERROR(ENOENT)); } /* @@ -3417,6 +4655,7 @@ spa_export_common(char *pool, int new_st * have to force it to sync before checking spa_refcnt. */ txg_wait_synced(spa->spa_dsl_pool, 0); + spa_evicting_os_wait(spa); /* * A pool cannot be exported or destroyed if there are active @@ -3428,7 +4667,7 @@ spa_export_common(char *pool, int new_st new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); - return (EBUSY); + return (SET_ERROR(EBUSY)); } /* @@ -3441,7 +4680,7 @@ spa_export_common(char *pool, int new_st spa_has_active_shared_spare(spa)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); - return (EXDEV); + return (SET_ERROR(EXDEV)); } /* @@ -3452,7 +4691,8 @@ spa_export_common(char *pool, int new_st if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; - spa->spa_final_txg = spa_last_synced_txg(spa) + 1; + spa->spa_final_txg = spa_last_synced_txg(spa) + + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } @@ -3529,6 +4769,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroo nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, @@ -3611,6 +4853,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroo mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); mutex_exit(&spa_namespace_lock); return (0); @@ -3632,7 +4875,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroo int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { - uint64_t txg, open_txg; + uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; @@ -3640,6 +4883,8 @@ spa_vdev_attach(spa_t *spa, uint64_t gui int newvd_isspare; int error; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3653,7 +4898,7 @@ spa_vdev_attach(spa_t *spa, uint64_t gui pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, - VDEV_ALLOC_ADD)) != 0) + VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) @@ 
-3689,7 +4934,7 @@ spa_vdev_attach(spa_t *spa, uint64_t gui * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && - pvd->vdev_child[1] == oldvd && + oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); @@ -3701,13 +4946,15 @@ spa_vdev_attach(spa_t *spa, uint64_t gui * the same (spare replaces spare, non-spare replaces * non-spare). */ - if (pvd->vdev_ops == &vdev_replacing_ops) + if (pvd->vdev_ops == &vdev_replacing_ops && + spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops == &vdev_spare_ops && - newvd->vdev_isspare != oldvd->vdev_isspare) + } else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); - else if (pvd->vdev_ops != &vdev_spare_ops && - newvd->vdev_isspare) + } + + if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; @@ -3731,10 +4978,10 @@ spa_vdev_attach(spa_t *spa, uint64_t gui * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { - size_t plen = strlen(newvd->vdev_path) + 5; spa_strfree(oldvd->vdev_path); - oldvd->vdev_path = kmem_alloc(plen, KM_SLEEP); - snprintf(oldvd->vdev_path, plen, "%s/%s", + oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + KM_SLEEP); + (void) sprintf(oldvd->vdev_path, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); @@ -3742,6 +4989,9 @@ spa_vdev_attach(spa_t *spa, uint64_t gui } } + /* mark the device being resilvered */ + newvd->vdev_resilver_txg = txg; + /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. @@ -3768,13 +5018,14 @@ spa_vdev_attach(spa_t *spa, uint64_t gui vdev_config_dirty(tvd); /* - * Set newvd's DTL to [TXG_INITIAL, open_txg]. It will propagate - * upward when spa_vdev_exit() calls vdev_dtl_reassess(). + * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account + * for any dmu_sync-ed blocks. It will propagate upward when + * spa_vdev_exit() calls vdev_dtl_reassess(). */ - open_txg = txg + TXG_CONCURRENT_STATES - 1; + dtl_max_txg = txg + TXG_CONCURRENT_STATES; - vdev_dtl_dirty(newvd, DTL_MISSING, - TXG_INITIAL, open_txg - TXG_INITIAL + 1); + vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, + dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); @@ -3790,10 +5041,25 @@ spa_vdev_attach(spa_t *spa, uint64_t gui */ vdev_dirty(tvd, VDD_DTL, newvd, txg); - (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); + /* + * Schedule the resilver to restart in the future. We do this to + * ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. + */ + dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); + + if (spa->spa_bootfs) + spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); + + spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); + + /* + * Commit the config + */ + (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, - CRED(), "%s vdev=%s %s vdev=%s", + spa_history_log_internal(spa, "vdev attach", NULL, + "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? 
"for" : "to", oldvdpath); @@ -3801,16 +5067,12 @@ spa_vdev_attach(spa_t *spa, uint64_t gui spa_strfree(oldvdpath); spa_strfree(newvdpath); - /* - * Kick off a resilver to update newvd. - */ - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); - return (0); } /* * Detach a device from a mirror or replacing vdev. + * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ @@ -3822,10 +5084,11 @@ spa_vdev_detach(spa_t *spa, uint64_t gui vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; - uint64_t unspare_guid; - size_t len; + uint64_t unspare_guid = 0; char *vdpath; + ASSERT(spa_writeable(spa)); + txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3855,18 +5118,11 @@ spa_vdev_detach(spa_t *spa, uint64_t gui return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* - * If replace_done is specified, only remove this device if it's - * the first child of a replacing vdev. For the 'spare' vdev, either - * disk can be removed. - */ - if (replace_done) { - if (pvd->vdev_ops == &vdev_replacing_ops) { - if (vd->vdev_id != 0) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } else if (pvd->vdev_ops != &vdev_spare_ops) { - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - } - } + * Only 'replacing' or 'spare' vdevs can be replaced. + */ + if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); @@ -3893,16 +5149,22 @@ spa_vdev_detach(spa_t *spa, uint64_t gui * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ - if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 && - pvd->vdev_child[0]->vdev_path != NULL && - pvd->vdev_child[1]->vdev_path != NULL) { - ASSERT(pvd->vdev_child[1] == vd); - cvd = pvd->vdev_child[0]; - len = strlen(vd->vdev_path); - if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && - strcmp(cvd->vdev_path + len, "/old") == 0) { - spa_strfree(cvd->vdev_path); - cvd->vdev_path = spa_strdup(vd->vdev_path); + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && + vd->vdev_path != NULL) { + size_t len = strlen(vd->vdev_path); + + for (int c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + + if (cvd == vd || cvd->vdev_path == NULL) + continue; + + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + break; + } } } @@ -3912,7 +5174,8 @@ spa_vdev_detach(spa_t *spa, uint64_t gui * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) + vd->vdev_id == 0 && + pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* @@ -3934,7 +5197,7 @@ spa_vdev_detach(spa_t *spa, uint64_t gui /* * Remember one of the remaining children so we can get tvd below. */ - cvd = pvd->vdev_child[0]; + cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, @@ -3950,14 +5213,19 @@ spa_vdev_detach(spa_t *spa, uint64_t gui spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. 
Remove it from the tree. */ - if (pvd->vdev_children == 1) + if (pvd->vdev_children == 1) { + if (pvd->vdev_ops == &vdev_spare_ops) + cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); + } + /* * We don't set tvd until now because the parent we just removed @@ -3999,9 +5267,12 @@ spa_vdev_detach(spa_t *spa, uint64_t gui spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + /* hang on to the spa before we release the lock */ + spa_open_ref(spa, FTAG); + error = spa_vdev_exit(spa, vd, txg, 0); - spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), + spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); @@ -4011,23 +5282,31 @@ spa_vdev_detach(spa_t *spa, uint64_t gui * list of every other pool. */ if (unspare) { - spa_t *myspa = spa; - spa = NULL; + spa_t *altspa = NULL; + mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) - continue; - if (spa == myspa) + while ((altspa = spa_next(altspa)) != NULL) { + if (altspa->spa_state != POOL_STATE_ACTIVE || + altspa == spa) continue; - spa_open_ref(spa, FTAG); + + spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); + spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); + + /* search the rest of the vdevs for spares to remove */ + spa_vdev_resilver_done(spa); } + /* all done with the spa; OK to release */ + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + mutex_exit(&spa_namespace_lock); + return (error); } @@ -4048,8 +5327,7 @@ spa_vdev_split_mirror(spa_t *spa, char * vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; - if (!spa_writeable(spa)) - return (EROFS); + ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); @@ -4115,7 +5393,7 @@ spa_vdev_split_mirror(spa_t *spa, char * spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { - error = EINVAL; + error = SET_ERROR(EINVAL); break; } } @@ -4123,14 +5401,14 @@ spa_vdev_split_mirror(spa_t *spa, char * /* which disk is going to be split? 
*/ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { - error = EINVAL; + error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { - error = ENODEV; + error = SET_ERROR(ENODEV); break; } @@ -4144,12 +5422,12 @@ spa_vdev_split_mirror(spa_t *spa, char * vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { - error = EINVAL; + error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c])) { - error = EBUSY; + error = SET_ERROR(EBUSY); break; } @@ -4162,6 +5440,16 @@ spa_vdev_split_mirror(spa_t *spa, char * vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); + + /* transfer per-vdev ZAPs */ + ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); + + ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_TOP_ZAP, + vml[c]->vdev_parent->vdev_top_zap)); } if (error != 0) { @@ -4186,8 +5474,10 @@ spa_vdev_split_mirror(spa_t *spa, char * glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); + mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); + mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); @@ -4201,11 +5491,13 @@ spa_vdev_split_mirror(spa_t *spa, char * spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); + VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); + newspa->spa_avz_action = AVZ_ACTION_REBUILD; newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); @@ -4218,8 +5510,15 @@ spa_vdev_split_mirror(spa_t *spa, char * spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); +#ifndef illumos + /* mark that we are creating new spa by splitting */ + newspa->spa_splitting_newspa = B_TRUE; +#endif /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); +#ifndef illumos + newspa->spa_splitting_newspa = B_FALSE; +#endif if (error) goto out; @@ -4261,12 +5560,13 @@ spa_vdev_split_mirror(spa_t *spa, char * if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) - spa_history_internal_log(LOG_POOL_VDEV_DETACH, - spa, tx, CRED(), "vdev=%s", - vml[c]->vdev_path); + spa_history_log_internal(spa, "detach", tx, + "vdev=%s", vml[c]->vdev_path); + vdev_free(vml[c]); } } + spa->spa_avz_action = AVZ_ACTION_REBUILD; vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); @@ -4278,8 +5578,8 @@ spa_vdev_split_mirror(spa_t *spa, char * zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ - spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), - "split new pool %s from pool %s", newname, spa_name(spa)); + spa_history_log_internal(newspa, "split", NULL, + "from pool %s", spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); @@ -4296,6 +5596,14 @@ out: spa_remove(newspa); txg = spa_vdev_config_enter(spa); + + /* re-online all offlined disks */ + for (c = 
0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_FALSE; + } + vdev_reopen(spa->spa_root_vdev); + nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); @@ -4322,7 +5630,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvp static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, - nvlist_t *dev_to_remove) + nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; @@ -4346,21 +5654,13 @@ spa_vdev_remove_aux(nvlist_t *config, ch } /* - * Removing a device from the vdev namespace requires several steps - * and can take a significant amount of time. As a result we use - * the spa_vdev_config_[enter/exit] functions which allow us to - * grab and release the spa_config_lock while still holding the namespace - * lock. During each step the configuration is synced out. + * Evacuate the device. */ - -/* - * Evacuate the device. - */ -int +static int spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) { - int error = 0; uint64_t txg; + int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); @@ -4373,14 +5673,12 @@ spa_vdev_remove_evacuate(spa_t *spa, vde * should no longer have any blocks allocated on it. */ if (vd->vdev_islog) { - error = dmu_objset_find(spa_name(spa), zil_vdev_offline, - NULL, DS_FIND_CHILDREN); + if (vd->vdev_stat.vs_alloc != 0) + error = spa_offline_log(spa); } else { - error = ENOTSUP; /* until we have bp rewrite */ + error = SET_ERROR(ENOTSUP); } - txg_wait_synced(spa_get_dsl(spa), 0); - if (error) return (error); @@ -4388,9 +5686,10 @@ spa_vdev_remove_evacuate(spa_t *spa, vde * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ + ASSERT0(vd->vdev_stat.vs_alloc); txg = spa_vdev_config_enter(spa); vd->vdev_removing = B_TRUE; - vdev_dirty(vd, 0, NULL, txg); + vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); @@ -4400,7 +5699,7 @@ spa_vdev_remove_evacuate(spa_t *spa, vde /* * Complete the removal by cleaning up the namespace. */ -void +static void spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; @@ -4411,6 +5710,12 @@ spa_vdev_remove_from_namespace(spa_t *sp ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vd == vd->vdev_top); + /* + * Only remove any devices which are empty. + */ + if (vd->vdev_stat.vs_alloc != 0) + return; + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) @@ -4435,13 +5740,22 @@ spa_vdev_remove_from_namespace(spa_t *sp } /* - * Remove a device from the pool. Currently, this supports removing only hot - * spares, slogs, and level 2 ARC devices. + * Remove a device from the pool - + * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + * + * Currently, this supports removing only hot spares, slogs, and level 2 ARC + * devices. 
*/ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; + sysevent_t *ev = NULL; metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; @@ -4449,6 +5763,8 @@ spa_vdev_remove(spa_t *spa, uint64_t gui int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + ASSERT(spa_writeable(spa)); + if (!locked) txg = spa_vdev_enter(spa); @@ -4463,12 +5779,15 @@ spa_vdev_remove(spa_t *spa, uint64_t gui * in this pool. */ if (vd == NULL || unspare) { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } else { - error = EBUSY; + error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, @@ -4477,6 +5796,8 @@ spa_vdev_remove(spa_t *spa, uint64_t gui /* * Cache devices can always be removed. */ + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); @@ -4485,11 +5806,6 @@ spa_vdev_remove(spa_t *spa, uint64_t gui ASSERT(!locked); ASSERT(vd == vd->vdev_top); - /* - * XXX - Once we have bp-rewrite this should - * become the common case. - */ - mg = vd->vdev_mg; /* @@ -4522,29 +5838,33 @@ spa_vdev_remove(spa_t *spa, uint64_t gui /* * Clean up the vdev namespace. */ + ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV); spa_vdev_remove_from_namespace(spa, vd); } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). */ - error = ENOTSUP; + error = SET_ERROR(ENOTSUP); } else { /* * There is no vdev of any kind with the specified guid. */ - error = ENOENT; + error = SET_ERROR(ENOENT); } if (!locked) - return (spa_vdev_exit(spa, NULL, txg, error)); + error = spa_vdev_exit(spa, NULL, txg, error); + + if (ev) + spa_event_post(ev); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's - * current spared, so we can detach it. + * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) @@ -4558,13 +5878,21 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) } /* - * Check for a completed replacement. + * Check for a completed replacement. We always consider the first + * vdev in the list to be the oldest vdev, and the last one to be + * the newest (see spa_vdev_attach() for how that works). In + * the case where the newest vdev is faulted, we will not automatically + * remove it after a resilver completes. This is OK as it will require + * user intervention to determine which disk the admin wishes to keep. */ - if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { + if (vd->vdev_ops == &vdev_replacing_ops) { + ASSERT(vd->vdev_children > 1); + + newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; - newvd = vd->vdev_child[1]; if (vdev_dtl_empty(newvd, DTL_MISSING) && + vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } @@ -4572,15 +5900,41 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) /* * Check for a completed resilver with the 'unspare' flag set. 
*/ - if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) { - newvd = vd->vdev_child[0]; - oldvd = vd->vdev_child[1]; + if (vd->vdev_ops == &vdev_spare_ops) { + vdev_t *first = vd->vdev_child[0]; + vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; + + if (last->vdev_unspare) { + oldvd = first; + newvd = last; + } else if (first->vdev_unspare) { + oldvd = last; + newvd = first; + } else { + oldvd = NULL; + } - if (newvd->vdev_unspare && + if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && - !vdev_dtl_required(oldvd)) { - newvd->vdev_unspare = 0; + vdev_dtl_empty(newvd, DTL_OUTAGE) && + !vdev_dtl_required(oldvd)) return (oldvd); + + /* + * If there are more than two spares attached to a disk, + * and those spares are not required, then we want to + * attempt to free them up now so that they can be used + * by other pools. Once we're back down to a single + * disk+spare, we stop removing them. + */ + if (vd->vdev_children > 2) { + newvd = vd->vdev_child[1]; + + if (newvd->vdev_isspare && last->vdev_isspare && + vdev_dtl_empty(last, DTL_MISSING) && + vdev_dtl_empty(last, DTL_OUTAGE) && + !vdev_dtl_required(newvd)) + return (newvd); } } @@ -4607,11 +5961,13 @@ spa_vdev_resilver_done(spa_t *spa) * we need to detach the parent's first child (the original hot * spare) as well. */ - if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && + ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(ppvd->vdev_children == 2); sguid = ppvd->vdev_child[1]->vdev_guid; } + ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); + spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; @@ -4631,6 +5987,9 @@ spa_vdev_set_common(spa_t *spa, uint64_t boolean_t ispath) { vdev_t *vd; + boolean_t sync = B_FALSE; + + ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); @@ -4641,15 +6000,23 @@ spa_vdev_set_common(spa_t *spa, uint64_t return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(value); + if (strcmp(value, vd->vdev_path) != 0) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + sync = B_TRUE; + } } else { - if (vd->vdev_fru != NULL) + if (vd->vdev_fru == NULL) { + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); - vd->vdev_fru = spa_strdup(value); + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } } - return (spa_vdev_state_exit(spa, vd, 0)); + return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); } int @@ -4666,40 +6033,38 @@ spa_vdev_setfru(spa_t *spa, uint64_t gui /* * ========================================================================== - * SPA Scrubbing + * SPA Scanning * ========================================================================== */ int -spa_scrub(spa_t *spa, pool_scrub_type_t type) +spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); +} - if ((uint_t)type >= POOL_SCRUB_TYPES) - return (ENOTSUP); +int +spa_scan(spa_t *spa, pool_scan_func_t func) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) + return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ - if (type == POOL_SCRUB_RESILVER && + if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } - if (type == POOL_SCRUB_EVERYTHING && - spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE && - spa->spa_dsl_pool->dp_scrub_isresilver) - return (EBUSY); - - if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) { - return (dsl_pool_scrub_clean(spa->spa_dsl_pool)); - } else if (type == POOL_SCRUB_NONE) { - return (dsl_pool_scrub_cancel(spa->spa_dsl_pool)); - } else { - return (EINVAL); - } + return (dsl_scan(spa->spa_dsl_pool, func)); } /* @@ -4712,7 +6077,8 @@ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { - vd->vdev_remove_wanted = 0; + vd->vdev_remove_wanted = B_FALSE; + vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* @@ -4726,6 +6092,8 @@ spa_async_remove(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); + /* Tell userspace that the vdev is gone. */ + zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) @@ -4736,7 +6104,7 @@ static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = 0; + vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } @@ -4769,7 +6137,7 @@ spa_async_autoexpand(spa_t *spa, vdev_t VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, - ESC_DEV_DLE, attr, &eid, DDI_SLEEP); + ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); @@ -4778,14 +6146,14 @@ spa_async_autoexpand(spa_t *spa, vdev_t static void spa_async_thread(void *arg) { - int tasks; spa_t *spa = arg; + int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; - spa->spa_async_tasks = 0; + spa->spa_async_tasks &= SPA_ASYNC_REMOVE; mutex_exit(&spa->spa_async_lock); /* @@ -4805,26 +6173,12 @@ spa_async_thread(void *arg) * then log an internal history event. */ if (new_space != old_space) { - spa_history_internal_log(LOG_POOL_VDEV_ONLINE, - spa, NULL, CRED(), + spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } } - /* - * See if any devices need to be marked REMOVED. 
- */ - if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa, SCL_NONE); - spa_async_remove(spa, spa->spa_root_vdev); - for (int i = 0; i < spa->spa_l2cache.sav_count; i++) - spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); - for (int i = 0; i < spa->spa_spares.sav_count; i++) - spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); - (void) spa_vdev_state_exit(spa, NULL, 0); - } - if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); @@ -4850,7 +6204,7 @@ spa_async_thread(void *arg) * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) - VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0); + dsl_resilver_restart(spa->spa_dsl_pool, 0); /* * Let the world know that we're done. @@ -4862,12 +6216,53 @@ spa_async_thread(void *arg) thread_exit(); } +static void +spa_async_thread_vd(void *arg) +{ + spa_t *spa = arg; + int tasks; + + ASSERT(spa->spa_sync_on); + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; +retry: + spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; + mutex_exit(&spa->spa_async_lock); + + /* + * See if any devices need to be marked REMOVED. + */ + if (tasks & SPA_ASYNC_REMOVE) { + spa_vdev_state_enter(spa, SCL_NONE); + spa_async_remove(spa, spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < spa->spa_spares.sav_count; i++) + spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); + (void) spa_vdev_state_exit(spa, NULL, 0); + } + + /* + * Let the world know that we're done. + */ + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + if ((tasks & SPA_ASYNC_REMOVE) != 0) + goto retry; + spa->spa_async_thread_vd = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; - while (spa->spa_async_thread != NULL) + while (spa->spa_async_thread != NULL && + spa->spa_async_thread_vd != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); } @@ -4881,24 +6276,61 @@ spa_async_resume(spa_t *spa) mutex_exit(&spa->spa_async_lock); } +static boolean_t +spa_async_tasks_pending(spa_t *spa) +{ + uint_t non_config_tasks; + uint_t config_task; + boolean_t config_task_suspended; + + non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | + SPA_ASYNC_REMOVE); + config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; + if (spa->spa_ccw_fail_time == 0) { + config_task_suspended = B_FALSE; + } else { + config_task_suspended = + (gethrtime() - spa->spa_ccw_fail_time) < + (zfs_ccw_retry_interval * NANOSEC); + } + + return (non_config_tasks || (config_task && !config_task_suspended)); +} + static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); - if (spa->spa_async_tasks && !spa->spa_async_suspended && + if (spa_async_tasks_pending(spa) && + !spa->spa_async_suspended && spa->spa_async_thread == NULL && - rootdir != NULL && !vn_is_readonly(rootdir)) + rootdir != NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } +static void +spa_async_dispatch_vd(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && + !spa->spa_async_suspended && + spa->spa_async_thread_vd == NULL && + rootdir != NULL) + 
spa->spa_async_thread_vd = thread_create(NULL, 0, + spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + void spa_async_request(spa_t *spa, int task) { + zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); + spa_async_dispatch_vd(spa); } /* @@ -4906,36 +6338,51 @@ spa_async_request(spa_t *spa, int task) * SPA syncing routines * ========================================================================== */ -static void -spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg) -{ - blkptr_t blk; - uint64_t itor = 0; - uint8_t c = 1; - while (bplist_iterate(bpl, &itor, &blk) == 0) { - ASSERT(blk.blk_birth < txg); - zio_free(spa, txg, &blk); - } +static int +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + bpobj_t *bpo = arg; + bpobj_enqueue(bpo, bp, tx); + return (0); +} - bplist_vacate(bpl, tx); +static int +spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zio_t *zio = arg; - /* - * Pre-dirty the first block so we sync to convergence faster. - * (Usually only the first block is needed.) - */ - dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx); + zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, + BP_GET_PSIZE(bp), zio->io_flags)); + return (0); } +/* + * Note: this simple function is not inlined to make it easier to dtrace the + * amount of time spent syncing frees. + */ static void -spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { - zio_t *zio = arg; + zio_t *zio = zio_root(spa, NULL, NULL, 0); + bplist_iterate(bpl, spa_free_sync_cb, zio, tx); + VERIFY(zio_wait(zio) == 0); +} - zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, - zio->io_flags)); +/* + * Note: this simple function is not inlined to make it easier to dtrace the + * amount of time spent syncing deferred frees. + */ +static void +spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) +{ + zio_t *zio = zio_root(spa, NULL, NULL, 0); + VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, + spa_free_sync_cb, zio, tx), ==, 0); + VERIFY0(zio_wait(zio)); } + static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { @@ -4948,10 +6395,10 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration - * information. This avoids the dbuf_will_dirty() path and + * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ - bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); + bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, @@ -5000,7 +6447,7 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vde list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], - B_FALSE, B_FALSE, B_TRUE); + B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) @@ -5014,63 +6461,200 @@ spa_sync_aux_dev(spa_t *spa, spa_aux_vde sav->sav_sync = B_FALSE; } +/* + * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. + * The all-vdev ZAP must be empty. 
+ */ +static void +spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + if (vd->vdev_top_zap != 0) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_top_zap, tx)); + } + if (vd->vdev_leaf_zap != 0) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_leaf_zap, tx)); + } + for (uint64_t i = 0; i < vd->vdev_children; i++) { + spa_avz_build(vd->vdev_child[i], avz, tx); + } +} + static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; - if (list_is_empty(&spa->spa_config_dirty_list)) + /* + * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, + * its config may not be dirty but we still need to build per-vdev ZAPs. + * Similarly, if the pool is being assembled (e.g. after a split), we + * need to rebuild the AVZ although the config may not be dirty. + */ + if (list_is_empty(&spa->spa_config_dirty_list) && + spa->spa_avz_action == AVZ_ACTION_NONE) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || + spa->spa_all_vdev_zaps != 0); + + if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { + /* Make and build the new AVZ */ + uint64_t new_avz = zap_create(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + spa_avz_build(spa->spa_root_vdev, new_avz, tx); + + /* Diff old AVZ with new one */ + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_all_vdev_zaps); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t vdzap = za.za_first_integer; + if (zap_lookup_int(spa->spa_meta_objset, new_avz, + vdzap) == ENOENT) { + /* + * ZAP is listed in old AVZ but not in new one; + * destroy it + */ + VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, + tx)); + } + } + + zap_cursor_fini(&zc); + + /* Destroy the old AVZ */ + VERIFY0(zap_destroy(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, tx)); + + /* Replace the old AVZ in the dir obj with the new one */ + VERIFY0(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, + sizeof (new_avz), 1, &new_avz, tx)); + + spa->spa_all_vdev_zaps = new_avz; + } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { + zap_cursor_t zc; + zap_attribute_t za; + + /* Walk through the AVZ and destroy all listed ZAPs */ + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_all_vdev_zaps); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zap = za.za_first_integer; + VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); + } + + zap_cursor_fini(&zc); + + /* Destroy and unlink the AVZ itself */ + VERIFY0(zap_destroy(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, tx)); + VERIFY0(zap_remove(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); + spa->spa_all_vdev_zaps = 0; + } + + if (spa->spa_all_vdev_zaps == 0) { + spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_VDEV_ZAP_MAP, tx); + } + spa->spa_avz_action = AVZ_ACTION_NONE; + + /* Create ZAPs for vdevs that don't have them. */ + vdev_construct_zaps(spa->spa_root_vdev, tx); + config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); + /* + * If we're upgrading the spa version then make sure that + * the config object gets updated with the correct version. 
+ */ + if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa->spa_uberblock.ub_version); + spa_config_exit(spa, SCL_STATE, FTAG); - if (spa->spa_config_syncing) - nvlist_free(spa->spa_config_syncing); + nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } +static void +spa_sync_version(void *arg, dmu_tx_t *tx) +{ + uint64_t *versionp = arg; + uint64_t version = *versionp; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + /* + * Setting the version is special cased when first creating the pool. + */ + ASSERT(tx->tx_txg != TXG_INITIAL); + + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + ASSERT(version >= spa_version(spa)); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); + spa_history_log_internal(spa, "set", tx, "version=%lld", version); +} + /* * Set zpool properties. */ static void -spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_sync_props(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; + nvlist_t *nvp = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; - nvlist_t *nvp = arg2; - nvpair_t *elem; - uint64_t intval; - char *strval; - zpool_prop_t prop; - const char *propname; - zprop_type_t proptype; + nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); - elem = NULL; while ((elem = nvlist_next_nvpair(nvp, elem))) { + uint64_t intval; + char *strval, *fname; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + spa_feature_t fid; + switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + case ZPROP_INVAL: + /* + * We checked this earlier in spa_prop_validate(). + */ + ASSERT(zpool_prop_feature(nvpair_name(elem))); + + fname = strchr(nvpair_name(elem), '@') + 1; + VERIFY0(zfeature_lookup_name(fname, &fid)); + + spa_feature_enable(spa, fid, tx); + spa_history_log_internal(spa, "set", tx, + "%s=enabled", nvpair_name(elem)); + break; + case ZPOOL_PROP_VERSION: + intval = fnvpair_value_uint64(elem); /* - * Only set version for non-zpool-creation cases - * (set/import). spa_create() needs special care - * for version setting. + * The version is synced seperatly before other + * properties and should be correct by now. */ - if (tx->tx_txg != TXG_INITIAL) { - VERIFY(nvpair_value_uint64(elem, - &intval) == 0); - ASSERT(intval <= SPA_VERSION); - ASSERT(intval >= spa_version(spa)); - spa->spa_uberblock.ub_version = intval; - vdev_config_dirty(spa->spa_root_vdev); - } + ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: @@ -5081,24 +6665,38 @@ spa_sync_props(void *arg1, void *arg2, c ASSERT(spa->spa_root != NULL); break; + case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* - * 'cachefile' is also a non-persisitent property. + * 'readonly' and 'cachefile' are also non-persisitent + * properties. */ break; + case ZPOOL_PROP_COMMENT: + strval = fnvpair_value_string(elem); + if (spa->spa_comment != NULL) + spa_strfree(spa->spa_comment); + spa->spa_comment = spa_strdup(strval); + /* + * We need to dirty the configuration on all the vdevs + * so that their labels get updated. It's unnecessary + * to do this for pool creation since the vdev's + * configuratoin has already been dirtied. + */ + if (tx->tx_txg != TXG_INITIAL) + vdev_config_dirty(spa->spa_root_vdev); + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); + break; default: /* * Set pool property values in the poolprops mos object. 
*/ if (spa->spa_pool_props_object == 0) { - VERIFY((spa->spa_pool_props_object = - zap_create(mos, DMU_OT_POOL_PROPS, - DMU_OT_NONE, 0, tx)) > 0); - - VERIFY(zap_update(mos, + spa->spa_pool_props_object = + zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, - 8, 1, &spa->spa_pool_props_object, tx) - == 0); + tx); } /* normalize the property name */ @@ -5107,22 +6705,25 @@ spa_sync_props(void *arg1, void *arg2, c if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); - VERIFY(nvpair_value_string(elem, &strval) == 0); - VERIFY(zap_update(mos, + strval = fnvpair_value_string(elem); + VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, - 1, strlen(strval) + 1, strval, tx) == 0); - + 1, strlen(strval) + 1, strval, tx)); + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { - VERIFY(nvpair_value_uint64(elem, &intval) == 0); + intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; - VERIFY(zpool_prop_index_to_string( - prop, intval, &unused) == 0); + VERIFY0(zpool_prop_index_to_string( + prop, intval, &unused)); } - VERIFY(zap_update(mos, + VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, - 8, 1, &intval, tx) == 0); + 8, 1, &intval, tx)); + spa_history_log_internal(spa, "set", tx, + "%s=%lld", nvpair_name(elem), intval); } else { ASSERT(0); /* not allowed */ } @@ -5139,7 +6740,9 @@ spa_sync_props(void *arg1, void *arg2, c break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; - spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); + if (tx->tx_txg != TXG_INITIAL) + spa_async_request(spa, + SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_DEDUPDITTO: spa->spa_dedup_ditto = intval; @@ -5149,33 +6752,104 @@ spa_sync_props(void *arg1, void *arg2, c } } - /* log internal history if this is not a zpool create */ - if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && - tx->tx_txg != TXG_INITIAL) { - spa_history_internal_log(LOG_POOL_PROPSET, - spa, tx, cr, "%s %lld %s", - nvpair_name(elem), intval, spa_name(spa)); - } } mutex_exit(&spa->spa_props_lock); } /* + * Perform one-time upgrade on-disk changes. spa_version() does not + * reflect the new version this txg, so there must be no changes this + * txg to anything that the upgrade code depends on after it executes. + * Therefore this must be called after dsl_pool_sync() does the sync + * tasks. 
+ */ +static void +spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + + ASSERT(spa->spa_sync_pass == 1); + + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + + if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && + spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { + dsl_pool_create_origin(dp, tx); + + /* Keeping the origin open increases spa_minref */ + spa->spa_minref += 3; + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { + dsl_pool_upgrade_clones(dp, tx); + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && + spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { + dsl_pool_upgrade_dir_clones(dp, tx); + + /* Keeping the freedir open increases spa_minref */ + spa->spa_minref += 3; + } + + if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && + spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { + spa_feature_create_zap_objects(spa, tx); + } + + /* + * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable + * when possibility to use lz4 compression for metadata was added + * Old pools that have this feature enabled must be upgraded to have + * this feature active + */ + if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { + boolean_t lz4_en = spa_feature_is_enabled(spa, + SPA_FEATURE_LZ4_COMPRESS); + boolean_t lz4_ac = spa_feature_is_active(spa, + SPA_FEATURE_LZ4_COMPRESS); + + if (lz4_en && !lz4_ac) + spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); + } + + /* + * If we haven't written the salt, do so now. Note that the + * feature may not be activated yet, but that's fine since + * the presence of this ZAP entry is backwards compatible. + */ + if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT) == ENOENT) { + VERIFY0(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes, tx)); + } + + rrw_exit(&dp->dp_config_rwlock, FTAG); +} + +/* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ + void spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; - bplist_t *defer_bpl = &spa->spa_deferred_bplist; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; + uint32_t max_queue_depth = zfs_vdev_async_write_max_active * + zfs_vdev_queue_depth_pct / 100; + + VERIFY(spa_writeable(spa)); /* * Lock out configuration changes. @@ -5185,6 +6859,10 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. 
@@ -5209,10 +6887,26 @@ spa_sync(spa_t *spa, uint64_t txg) } spa_config_exit(spa, SCL_STATE, FTAG); - VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); - tx = dmu_tx_create_assigned(dp, txg); + spa->spa_sync_starttime = gethrtime(); +#ifdef illumos + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, + spa->spa_sync_starttime + spa->spa_deadman_synctime)); +#endif /* illumos */ +#ifdef __FreeBSD__ +#ifdef _KERNEL + callout_schedule(&spa->spa_deadman_cycid, + hz * spa->spa_deadman_synctime / NANOSEC); +#endif +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ +#ifdef _KERNEL + callout_schedule(&spa->spa_deadman_cycid, + hz * spa->spa_deadman_synctime / NANOSEC); +#endif +#endif + /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. @@ -5234,28 +6928,37 @@ spa_sync(spa_t *spa, uint64_t txg) } } - if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && - spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { - dsl_pool_create_origin(dp, tx); + /* + * Set the top-level vdev's max queue depth. Evaluate each + * top-level's async write queue depth in case it changed. + * The max queue depth will not change in the middle of syncing + * out this txg. + */ + uint64_t queue_depth_total = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; - /* Keeping the origin open increases spa_minref */ - spa->spa_minref += 3; - } + if (mg == NULL || mg->mg_class != spa_normal_class(spa) || + !metaslab_group_initialized(mg)) + continue; - if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && - spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { - dsl_pool_upgrade_clones(dp, tx); - } + /* + * It is safe to do a lock-free check here because only async + * allocations look at mg_max_alloc_queue_depth, and async + * allocations all happen from spa_sync(). + */ + ASSERT0(refcount_count(&mg->mg_alloc_queue_depth)); + mg->mg_max_alloc_queue_depth = max_queue_depth; + queue_depth_total += mg->mg_max_alloc_queue_depth; + } + metaslab_class_t *mc = spa_normal_class(spa); + ASSERT0(refcount_count(&mc->mc_alloc_slots)); + mc->mc_alloc_max_slots = queue_depth_total; + mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; - /* - * If anything has changed in this txg, push the deferred frees - * from the previous txg. If not, leave them alone so that we - * don't generate work on an otherwise idle system. - */ - if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || - !txg_list_empty(&dp->dp_dirty_dirs, txg) || - !txg_list_empty(&dp->dp_sync_tasks, txg)) - spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); + ASSERT3U(mc->mc_alloc_max_slots, <=, + max_queue_depth * rvd->vdev_children); /* * Iterate to convergence. @@ -5271,29 +6974,72 @@ spa_sync(spa_t *spa, uint64_t txg) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - if (pass <= SYNC_PASS_DEFERRED_FREE) { - zio_t *zio = zio_root(spa, NULL, NULL, 0); - bplist_sync(free_bpl, spa_sync_free, zio, tx); - VERIFY(zio_wait(zio) == 0); + if (pass < zfs_sync_pass_deferred_free) { + spa_sync_frees(spa, free_bpl, tx); } else { - bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); + /* + * We can not defer frees in pass 1, because + * we sync the deferred frees later in pass 1. 
+ */ + ASSERT3U(pass, >, 1); + bplist_iterate(free_bpl, bpobj_enqueue_cb, + &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); - - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) - cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - mutex_exit(&spa->spa_scrub_lock); + dsl_scan_sync(dp, tx); while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) vdev_sync(vd, txg); - } while (dmu_objset_is_dirty(mos, txg)); + if (pass == 1) { + spa_sync_upgrades(spa, tx); + ASSERT3U(txg, >=, + spa->spa_uberblock.ub_rootbp.blk_birth); + /* + * Note: We need to check if the MOS is dirty + * because we could have marked the MOS dirty + * without updating the uberblock (e.g. if we + * have sync tasks but no dirty user data). We + * need to check the uberblock's rootbp because + * it is updated if we have synced out dirty + * data (though in this case the MOS will most + * likely also be dirty due to second order + * effects, we don't want to rely on that here). + */ + if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(mos, txg)) { + /* + * Nothing changed on the first pass, + * therefore this TXG is a no-op. Avoid + * syncing deferred frees, so that we + * can keep this TXG as a no-op. + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, + txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); + break; + } + spa_sync_deferred_frees(spa, tx); + } - ASSERT(free_bpl->bpl_queue == NULL); + } while (dmu_objset_is_dirty(mos, txg)); - bplist_close(defer_bpl); + if (!list_is_empty(&spa->spa_config_dirty_list)) { + /* + * Make sure that the number of ZAPs for all the vdevs matches + * the number of ZAPs in the per-vdev ZAP list. This only gets + * called if the config is dirty; otherwise there may be + * outstanding AVZ operations that weren't completed in + * spa_sync_config_object. + */ + uint64_t all_vdev_zap_entry_count; + ASSERT0(zap_count(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); + ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, + all_vdev_zap_entry_count); + } /* * Rewrite the vdev configuration (which includes the uberblock) @@ -5325,18 +7071,15 @@ spa_sync(spa_t *spa, uint64_t txg) if (svdcount == SPA_DVAS_PER_BP) break; } - error = vdev_config_sync(svd, svdcount, txg, B_FALSE); - if (error != 0) - error = vdev_config_sync(svd, svdcount, txg, - B_TRUE); + error = vdev_config_sync(svd, svdcount, txg); } else { error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg, B_FALSE); - if (error != 0) - error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg, B_TRUE); + rvd->vdev_children, txg); } + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) @@ -5346,6 +7089,20 @@ spa_sync(spa_t *spa, uint64_t txg) } dmu_tx_commit(tx); +#ifdef illumos + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); +#endif /* illumos */ +#ifdef __FreeBSD__ +#ifdef _KERNEL + callout_drain(&spa->spa_deadman_cycid); +#endif +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ +#ifdef _KERNEL + callout_drain(&spa->spa_deadman_cycid); +#endif +#endif /* __NetBSD__ */ + /* * Clear the dirty config list. 
*/ @@ -5362,10 +7119,12 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_config_syncing = NULL; } - spa->spa_ubsync = spa->spa_uberblock; - dsl_pool_sync_done(dp, txg); + mutex_enter(&spa->spa_alloc_lock); + VERIFY0(avl_numnodes(&spa->spa_alloc_tree)); + mutex_exit(&spa->spa_alloc_lock); + /* * Update usable space statistics. */ @@ -5381,11 +7140,16 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - ASSERT(defer_bpl->bpl_queue == NULL); - ASSERT(free_bpl->bpl_queue == NULL); spa->spa_sync_pass = 0; + /* + * Update the last synced uberblock here. We want to do this at + * the end of spa_sync() so that consumers of spa_last_synced_txg() + * will be guaranteed that all the processing associated with + * that txg has been completed. + */ + spa->spa_ubsync = spa->spa_uberblock; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); @@ -5394,6 +7158,7 @@ spa_sync(spa_t *spa, uint64_t txg) * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); + spa_async_dispatch_vd(spa); } /* @@ -5407,7 +7172,8 @@ spa_sync_allpools(void) spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa)) + if (spa_state(spa) != POOL_STATE_ACTIVE || + !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); @@ -5487,6 +7253,8 @@ spa_lookup_by_guid(spa_t *spa, uint64_t void spa_upgrade(spa_t *spa, uint64_t version) { + ASSERT(spa_writeable(spa)); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* @@ -5494,8 +7262,8 @@ spa_upgrade(spa_t *spa, uint64_t version * future version would result in an unopenable pool, this shouldn't be * possible. */ - ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); - ASSERT(version >= spa->spa_uberblock.ub_version); + ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); + ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); @@ -5546,25 +7314,17 @@ spa_has_active_shared_spare(spa_t *spa) return (B_FALSE); } -/* - * Post a sysevent corresponding to the given event. The 'name' must be one of - * the event definitions in sys/sysevent/eventdefs.h. The payload will be - * filled in from the spa and (optionally) the vdev. This doesn't do anything - * in the userland libzpool, as we don't want consumers to misinterpret ztest - * or zdb as real changes. 
- */ -void -spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +static sysevent_t * +spa_event_create(spa_t *spa, vdev_t *vd, const char *name) { -#ifndef __NetBSD__ + sysevent_t *ev = NULL; #ifdef _KERNEL - sysevent_t *ev; sysevent_attr_list_t *attr = NULL; sysevent_value_t value; - sysevent_id_t eid; ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", SE_SLEEP); + ASSERT(ev != NULL); value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = spa_name(spa); @@ -5596,12 +7356,34 @@ spa_event_notify(spa_t *spa, vdev_t *vd, goto done; attr = NULL; - (void) log_sysevent(ev, SE_SLEEP, &eid); - done: if (attr) sysevent_free_attr(attr); + +#endif + return (ev); +} + +static void +spa_event_post(sysevent_t *ev) +{ +#ifdef _KERNEL + sysevent_id_t eid; + + (void) log_sysevent(ev, SE_SLEEP, &eid); sysevent_free(ev); #endif -#endif /* __NetBSD__ */ +} + +/* + * Post a sysevent corresponding to the given event. The 'name' must be one of + * the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes. + */ +void +spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) +{ + spa_event_post(spa_event_create(spa, vd, name)); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c,v retrieving revision 1.6 diff -u -p -r1.6 spa_config.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c 21 Nov 2011 17:51:03 -0000 1.6 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c 14 Apr 2017 20:19:21 -0000 @@ -20,11 +20,14 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ +#include #include +#include #include #include #include @@ -32,8 +35,8 @@ #include #include #include -#include #include +#include #ifdef _KERNEL #include #include @@ -83,6 +86,7 @@ spa_config_load(void) * Open the configuration file. 
*/ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); file = kobj_open_file(pathname); @@ -122,7 +126,7 @@ spa_config_load(void) if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; - VERIFY(nvpair_value_nvlist(nvpair, &child) == 0); + child = fnvpair_value_nvlist(nvpair); if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; @@ -140,6 +144,26 @@ out: } static void +spa_config_clean(nvlist_t *nvl) +{ + nvlist_t **child; + nvlist_t *nvroot = NULL; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + spa_config_clean(child[c]); + } + + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0) + spa_config_clean(nvroot); + + nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY); + nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY); +} + +static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; @@ -147,25 +171,22 @@ spa_config_write(spa_config_dirent_t *dp vnode_t *vp; int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; char *temp; + int err; + /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { - (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); - return; + err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); + return (err); } /* * Pack the configuration into a buffer. */ - VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0); - - buf = kmem_alloc(buflen, KM_SLEEP); + buf = fnvlist_pack(nvl, &buflen); temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR, - KM_SLEEP) == 0); - /* * Write the configuration to disk. We need to do the traditional * 'write to temporary file, sync, move over original' to make sure we @@ -173,30 +194,40 @@ spa_config_write(spa_config_dirent_t *dp */ (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); - if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) { - if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL) == 0 && - VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) { - (void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE); - } + err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); + if (err == 0) { + err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL); + if (err == 0) + err = VOP_FSYNC(vp, FSYNC, kcred, NULL); + if (err == 0) + err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); } (void) vn_remove(temp, UIO_SYSSPACE, RMFILE); - kmem_free(buf, buflen); + fnvlist_pack_free(buf, buflen); kmem_free(temp, MAXPATHLEN); + return (err); } /* * Synchronize pool configuration to disk. This must be called with the - * namespace lock held. + * namespace lock held. Synchronizing the pool cache is typically done after + * the configuration has been synced to the MOS. This exposes a window where + * the MOS config will have been updated but the cache file has not. If + * the system were to crash at that instant then the cached config may not + * contain the correct information to open the pool and an explicit import + * would be required. 
*/ void spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; + boolean_t ccw_failure; + int error; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -208,6 +239,7 @@ spa_config_sync(spa_t *target, boolean_t * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. */ + ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; @@ -219,8 +251,19 @@ spa_config_sync(spa_t *target, boolean_t */ nvl = NULL; while ((spa = spa_next(spa)) != NULL) { - if (spa == target && removing) + nvlist_t *nvroot = NULL; + /* + * Skip over our own pool if we're about to remove + * ourselves from the spa namespace or any pool that + * is readonly. Since we cannot guarantee that a + * readonly pool would successfully import upon reboot, + * we don't allow them to be written to the cache file. + */ + if ((spa == target && removing) || + (spa_state(spa) == POOL_STATE_ACTIVE && + !spa_writeable(spa))) continue; + mutex_enter(&spa->spa_props_lock); tdp = list_head(&spa->spa_config_list); if (spa->spa_config == NULL || @@ -231,18 +274,42 @@ spa_config_sync(spa_t *target, boolean_t } if (nvl == NULL) - VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, - KM_SLEEP) == 0); + nvl = fnvlist_alloc(); - VERIFY(nvlist_add_nvlist(nvl, spa->spa_name, - spa->spa_config) == 0); + fnvlist_add_nvlist(nvl, spa->spa_name, + spa->spa_config); mutex_exit(&spa->spa_props_lock); + + if (nvlist_lookup_nvlist(nvl, spa->spa_name, &nvroot) == 0) + spa_config_clean(nvroot); } - spa_config_write(dp, nvl); + error = spa_config_write(dp, nvl); + if (error != 0) + ccw_failure = B_TRUE; nvlist_free(nvl); } + if (ccw_failure) { + /* + * Keep trying so that configuration data is + * written if/when any temporary filesystem + * resource issues are resolved. + */ + if (target->spa_ccw_fail_time == 0) { + zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, + target, NULL, NULL, 0, 0); + } + target->spa_ccw_fail_time = gethrtime(); + spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); + } else { + /* + * Do not rate limit future attempts to update + * the config cache. + */ + target->spa_ccw_fail_time = 0; + } + /* * Remove any config entries older than the current one. 
*/ @@ -275,15 +342,15 @@ spa_all_configs(uint64_t *generation) if (*generation == spa_config_generation) return (NULL); - VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0); + pools = fnvlist_alloc(); mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { - if (INGLOBALZONE(curproc) || + if (INGLOBALZONE(curthread) || zone_dataset_visible(spa_name(spa), NULL)) { mutex_enter(&spa->spa_props_lock); - VERIFY(nvlist_add_nvlist(pools, spa_name(spa), - spa->spa_config) == 0); + fnvlist_add_nvlist(pools, spa_name(spa), + spa->spa_config); mutex_exit(&spa->spa_props_lock); } } @@ -297,32 +364,14 @@ void spa_config_set(spa_t *spa, nvlist_t *config) { mutex_enter(&spa->spa_props_lock); - if (spa->spa_config != NULL) - nvlist_free(spa->spa_config); + nvlist_free(spa->spa_config); spa->spa_config = config; mutex_exit(&spa->spa_props_lock); } -/* Add discovered rewind info, if any to the provided nvlist */ -void -spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl) -{ - int64_t loss = 0; - - if (tonvl == NULL || spa->spa_load_txg == 0) - return; - - VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME, - spa->spa_load_txg_ts) == 0); - if (spa->spa_last_ubsync_txg) - loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; - VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); - VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS, - spa->spa_load_data_errors) == 0); -} - /* * Generate the pool's configuration based on the current in-core state. + * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ @@ -350,18 +399,18 @@ spa_config_generate(spa_t *spa, vdev_t * if (txg == -1ULL) txg = spa->spa_config_txg; - VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0); + config = fnvlist_alloc(); + + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, spa_name(spa)); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + if (spa->spa_comment != NULL) { + fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, + spa->spa_comment); + } - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, - spa_name(spa)) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, - spa_state(spa)) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, - txg) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, - spa_guid(spa)) == 0); #ifdef _KERNEL hostid = zone_get_hostid(NULL); #else /* _KERNEL */ @@ -372,23 +421,24 @@ spa_config_generate(spa_t *spa, vdev_t * (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); #endif /* _KERNEL */ if (hostid != 0) { - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, - hostid) == 0); + fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); } - VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, - utsname.nodename) == 0); + fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname.nodename); + int config_gen_flags = 0; if (vd != rvd) { - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, - vd->vdev_top->vdev_guid) == 0); - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - if (vd->vdev_isspare) - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_SPARE, - 1ULL) == 0); - if (vd->vdev_islog) - 
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, - 1ULL) == 0); + fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, + vd->vdev_top->vdev_guid); + fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, + vd->vdev_guid); + if (vd->vdev_isspare) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_IS_SPARE, 1ULL); + } + if (vd->vdev_islog) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_IS_LOG, 1ULL); + } vd = vd->vdev_top; /* label contains top config */ } else { /* @@ -396,14 +446,17 @@ spa_config_generate(spa_t *spa, vdev_t * * in the mos config, and not in the vdev labels */ if (spa->spa_config_splitting != NULL) - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, - spa->spa_config_splitting) == 0); + fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, + spa->spa_config_splitting); + fnvlist_add_boolean(config, + ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); + + config_gen_flags |= VDEV_CONFIG_MOS; } /* * Add the top-level config. We even add this on pools which - * don't support holes in the namespace as older pools will - * just ignore it. + * don't support holes in the namespace. */ vdev_top_config_generate(spa, config); @@ -413,14 +466,20 @@ spa_config_generate(spa_t *spa, vdev_t * if (spa->spa_config_splitting != NULL && nvlist_lookup_uint64(spa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { - VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, - split_guid) == 0); + fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, + split_guid); } - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); + fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); nvlist_free(nvroot); + /* + * Store what's necessary for reading the MOS in the label. 
+ */ + fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + spa->spa_label_features); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { ddt_histogram_t *ddh; ddt_stat_t *dds; @@ -428,28 +487,26 @@ spa_config_generate(spa_t *spa, vdev_t * ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh); - VERIFY(nvlist_add_uint64_array(config, + fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, - (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0); + (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); kmem_free(ddh, sizeof (ddt_histogram_t)); ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); ddt_get_dedup_object_stats(spa, ddo); - VERIFY(nvlist_add_uint64_array(config, + fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, - (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0); + (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); kmem_free(ddo, sizeof (ddt_object_t)); dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); ddt_get_dedup_stats(spa, dds); - VERIFY(nvlist_add_uint64_array(config, + fnvlist_add_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, - (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0); + (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); kmem_free(dds, sizeof (ddt_stat_t)); } - spa_rewind_data_to_nvlist(spa, config); - if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); @@ -484,8 +541,10 @@ spa_config_update(spa_t *spa, int what) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ms_array == 0) + if (tvd->vdev_ms_array == 0) { + vdev_ashift_optimize(tvd); vdev_metaslab_set_size(tvd); + } vdev_expand(tvd, txg); } } @@ -499,8 +558,7 @@ spa_config_update(spa_t *spa, int what) /* * Update the global config cache to reflect the new mosconfig. */ - if (!spa->spa_is_root) - spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); + spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 spa_errlog.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c 27 Feb 2010 22:31:06 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c 17 Jul 2014 16:23:26 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ /* @@ -35,7 +35,7 @@ * deleted from the log when the scrub completes. * * The log is stored using a ZAP object whose key is a string form of the - * zbookmark tuple (objset, object, level, blkid), and whose contents is an + * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an * optional 'objset:object' human-readable string describing the data. When an * error is first logged, this string will be empty, indicating that no name is * known. This prevents us from having to issue a potentially large amount of @@ -54,42 +54,12 @@ #include #include -/* - * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexidecimal numbers that don't overflow. 
- */ -uint64_t -strtonum(const char *str, char **nptr) -{ - uint64_t val = 0; - char c; - int digit; - - while ((c = *str) != '\0') { - if (c >= '0' && c <= '9') - digit = c - '0'; - else if (c >= 'a' && c <= 'f') - digit = 10 + c - 'a'; - else - break; - - val *= 16; - val += digit; - - str++; - } - - if (nptr) - *nptr = (char *)str; - - return (val); -} /* * Convert a bookmark to a string. */ static void -bookmark_to_name(zbookmark_t *zb, char *buf, size_t len) +bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) { (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, @@ -101,7 +71,7 @@ bookmark_to_name(zbookmark_t *zb, char * */ #ifdef _KERNEL static void -name_to_bookmark(char *buf, zbookmark_t *zb) +name_to_bookmark(char *buf, zbookmark_phys_t *zb) { zb->zb_objset = strtonum(buf, &buf); ASSERT(*buf == ':'); @@ -122,7 +92,7 @@ name_to_bookmark(char *buf, zbookmark_t void spa_log_error(spa_t *spa, zio_t *zio) { - zbookmark_t *zb = &zio->io_logical->io_bookmark; + zbookmark_phys_t *zb = &zio->io_logical->io_bookmark; spa_error_entry_t search; spa_error_entry_t *new; avl_tree_t *tree; @@ -195,7 +165,7 @@ process_error_log(spa_t *spa, uint64_t o { zap_cursor_t zc; zap_attribute_t za; - zbookmark_t zb; + zbookmark_phys_t zb; if (obj == 0) return (0); @@ -206,15 +176,17 @@ process_error_log(spa_t *spa, uint64_t o if (*count == 0) { zap_cursor_fini(&zc); - return (ENOMEM); + return (SET_ERROR(ENOMEM)); } name_to_bookmark(za.za_name, &zb); if (copyout(&zb, (char *)addr + - (*count - 1) * sizeof (zbookmark_t), - sizeof (zbookmark_t)) != 0) - return (EFAULT); + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) { + zap_cursor_fini(&zc); + return (SET_ERROR(EFAULT)); + } *count -= 1; } @@ -232,12 +204,12 @@ process_error_list(avl_tree_t *list, voi for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { if (*count == 0) - return (ENOMEM); + return (SET_ERROR(ENOMEM)); if (copyout(&se->se_bookmark, (char *)addr + - (*count - 1) * sizeof (zbookmark_t), - sizeof (zbookmark_t)) != 0) - return (EFAULT); + (*count - 1) * sizeof (zbookmark_phys_t), + sizeof (zbookmark_phys_t)) != 0) + return (SET_ERROR(EFAULT)); *count -= 1; } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c,v retrieving revision 1.3 diff -u -p -r1.3 spa_history.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c 27 Feb 2010 23:43:53 -0000 1.3 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c 14 Apr 2017 20:24:24 -0000 @@ -20,8 +20,9 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] */ #include @@ -30,10 +31,14 @@ #include #include #include +#include +#include #include -#include #include +#include +#include "zfs_comutil.h" #ifdef _KERNEL +#include #include #endif @@ -86,7 +91,7 @@ spa_history_create_obj(spa_t *spa, dmu_t ASSERT(spa->spa_history == 0); spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY, - SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS, sizeof (spa_history_phys_t), tx); VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, @@ -101,11 +106,11 @@ spa_history_create_obj(spa_t *spa, dmu_t /* * Figure out maximum size of history log. We set it at - * 1% of pool size, with a max of 32MB and min of 128KB. + * 0.1% of pool size, with a max of 1G and min of 128KB. */ shpp->sh_phys_max_off = - metaslab_class_get_dspace(spa_normal_class(spa)) / 100; - shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); + metaslab_class_get_dspace(spa_normal_class(spa)) / 1000; + shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); dmu_buf_rele(dbp, FTAG); @@ -175,30 +180,32 @@ spa_history_write(spa_t *spa, void *buf, } static char * -spa_history_zone() +spa_history_zone(void) { -#if defined(_KERNEL) && !defined(__NetBSD__) - return (curproc->p_zone->zone_name); -#else - return ("global"); +#ifdef _KERNEL +#ifdef __FreeBSD__ + /* XXX: pr_hostname can be changed by default from within a jail! */ + if (jailed(curthread->td_ucred)) + return (curthread->td_ucred->cr_prison->pr_hostname); +#endif #endif + return (NULL); } /* * Write out a history event. */ +/*ARGSUSED*/ static void -spa_history_log_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +spa_history_log_sync(void *arg, dmu_tx_t *tx) { - spa_t *spa = arg1; - history_arg_t *hap = arg2; - const char *history_str = hap->ha_history_str; + nvlist_t *nvl = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; spa_history_phys_t *shpp; size_t reclen; uint64_t le_len; - nvlist_t *nvrecord; char *record_packed = NULL; int ret; @@ -215,7 +222,7 @@ spa_history_log_sync(void *arg1, void *a * Get the offset of where we need to write via the bonus buffer. * Update the offset when the write completes. 
*/ - VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); shpp = dbp->db_data; dmu_buf_will_dirty(dbp, tx); @@ -228,40 +235,35 @@ spa_history_log_sync(void *arg1, void *a } #endif - VERIFY(nvlist_alloc(&nvrecord, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TIME, - gethrestime_sec()) == 0); - VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_WHO, - (uint64_t)crgetuid(cr)) == 0); - if (hap->ha_zone[0] != '\0') - VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_ZONE, - hap->ha_zone) == 0); + fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); #ifdef _KERNEL - VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_HOST, - utsname.nodename) == 0); + fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname.nodename); #endif - if (hap->ha_log_type == LOG_CMD_POOL_CREATE || - hap->ha_log_type == LOG_CMD_NORMAL) { - VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_CMD, - history_str) == 0); - } else { - VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_INT_EVENT, - hap->ha_event) == 0); - VERIFY(nvlist_add_uint64(nvrecord, ZPOOL_HIST_TXG, - tx->tx_txg) == 0); - VERIFY(nvlist_add_string(nvrecord, ZPOOL_HIST_INT_STR, - history_str) == 0); + if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { + zfs_dbgmsg("command: %s", + fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); + } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { + if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { + zfs_dbgmsg("txg %lld %s %s (id %llu) %s", + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), + fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); + } else { + zfs_dbgmsg("txg %lld %s %s", + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); + } + } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { + zfs_dbgmsg("ioctl %s", + fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); } - VERIFY(nvlist_size(nvrecord, &reclen, NV_ENCODE_XDR) == 0); - record_packed = kmem_alloc(reclen, KM_SLEEP); - - VERIFY(nvlist_pack(nvrecord, &record_packed, &reclen, - NV_ENCODE_XDR, KM_SLEEP) == 0); + record_packed = fnvlist_pack(nvl, &reclen); mutex_enter(&spa->spa_history_lock); - if (hap->ha_log_type == LOG_CMD_POOL_CREATE) - VERIFY(shpp->sh_eof == shpp->sh_pool_create_len); /* write out the packed length as little endian */ le_len = LE_64((uint64_t)reclen); @@ -269,37 +271,68 @@ spa_history_log_sync(void *arg1, void *a if (!ret) ret = spa_history_write(spa, record_packed, reclen, shpp, tx); - if (!ret && hap->ha_log_type == LOG_CMD_POOL_CREATE) { - shpp->sh_pool_create_len += sizeof (le_len) + reclen; - shpp->sh_bof = shpp->sh_pool_create_len; + /* The first command is the create, which we keep forever */ + if (ret == 0 && shpp->sh_pool_create_len == 0 && + nvlist_exists(nvl, ZPOOL_HIST_CMD)) { + shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; } mutex_exit(&spa->spa_history_lock); - nvlist_free(nvrecord); - kmem_free(record_packed, reclen); + fnvlist_pack_free(record_packed, reclen); dmu_buf_rele(dbp, FTAG); - - if (hap->ha_log_type == LOG_INTERNAL) { - kmem_free((void*)hap->ha_history_str, HIS_MAX_RECORD_LEN); - kmem_free(hap, sizeof (history_arg_t)); - } + fnvlist_free(nvl); } /* * Write out a history event. 
*/ int -spa_history_log(spa_t *spa, const char *history_str, history_log_type_t what) +spa_history_log(spa_t *spa, const char *msg) { - history_arg_t ha; + int err; + nvlist_t *nvl = fnvlist_alloc(); - ASSERT(what != LOG_INTERNAL); + fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); + err = spa_history_log_nvl(spa, nvl); + fnvlist_free(nvl); + return (err); +} + +int +spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) +{ + int err = 0; + dmu_tx_t *tx; + nvlist_t *nvarg; + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) + return (EINVAL); + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) + return (SET_ERROR(EINVAL)); + + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + nvarg = fnvlist_dup(nvl); + if (spa_history_zone() != NULL) { + fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, + spa_history_zone()); + } + fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); + + /* Kick this off asynchronously; errors are ignored. */ + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, + nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + /* spa_history_log_sync will free nvl */ + return (err); - ha.ha_history_str = history_str; - ha.ha_log_type = what; - (void) strlcpy(ha.ha_zone, spa_history_zone(), sizeof (ha.ha_zone)); - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_history_log_sync, - spa, &ha, 0)); } /* @@ -316,11 +349,19 @@ spa_history_get(spa_t *spa, uint64_t *of int err; /* - * If the command history doesn't exist (older pool), + * If the command history doesn't exist (older pool), * that's ok, just return ENOENT. */ if (!spa->spa_history) - return (ENOENT); + return (SET_ERROR(ENOENT)); + + /* + * The history is logged asynchronously, so when they request + * the first chunk of history, make sure everything has been + * synced to disk so that we get it. + */ + if (*offp == 0 && spa_writeable(spa)) + txg_wait_synced(spa_get_dsl(spa), 0); if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); @@ -391,42 +432,50 @@ spa_history_get(spa_t *spa, uint64_t *of return (err); } +/* + * The nvlist will be consumed by this call. + */ static void -log_internal(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx) +log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, + dmu_tx_t *tx, const char *fmt, va_list adx) { - history_arg_t *hap; - char *str; + char *msg; + va_list adx2; /* * If this is part of creating a pool, not everything is * initialized yet, so don't bother logging the internal events. + * Likewise if the pool is not writeable. 
*/ - if (tx->tx_txg == TXG_INITIAL) + if (tx->tx_txg == TXG_INITIAL || !spa_writeable(spa)) { + fnvlist_free(nvl); return; + } + + va_copy(adx2, adx); - hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); - str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); + (void) vsprintf(msg, fmt, adx2); + fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); + strfree(msg); - (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); + va_end(adx2); - hap->ha_log_type = LOG_INTERNAL; - hap->ha_history_str = str; - hap->ha_event = event; - hap->ha_zone[0] = '\0'; + fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); + fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); if (dmu_tx_is_syncing(tx)) { - spa_history_log_sync(spa, hap, cr, tx); + spa_history_log_sync(nvl, tx); } else { - dsl_sync_task_do_nowait(spa_get_dsl(spa), NULL, - spa_history_log_sync, spa, hap, 0, tx); + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); } - /* spa_history_log_sync() will free hap and str */ + /* spa_history_log_sync() will free nvl */ } void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +spa_history_log_internal(spa_t *spa, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) { dmu_tx_t *htx = tx; va_list adx; @@ -441,7 +490,7 @@ spa_history_internal_log(history_interna } va_start(adx, fmt); - log_internal(event, spa, htx, cr, fmt, adx); + log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); va_end(adx); /* if we didn't get a tx from the caller, commit the one we made */ @@ -450,21 +499,50 @@ spa_history_internal_log(history_interna } void -spa_history_log_version(spa_t *spa, history_internal_events_t event) +spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) { -#ifdef _KERNEL - uint64_t current_vers = spa_version(spa); + va_list adx; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *nvl = fnvlist_alloc(); - if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { - spa_history_internal_log(event, spa, NULL, CRED(), - "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", - (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, - utsname.nodename, utsname.release, utsname.version, - utsname.machine); - } - cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", - event == LOG_POOL_IMPORT ? "imported" : - event == LOG_POOL_CREATE ? "created" : "accessed", - (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); -#endif + ASSERT(tx != NULL); + + dsl_dataset_name(ds, namebuf); + fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); + fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); + + va_start(adx, fmt); + log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); + va_end(adx); +} + +void +spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) 
+{ + va_list adx; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *nvl = fnvlist_alloc(); + + ASSERT(tx != NULL); + + dsl_dir_name(dd, namebuf); + fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); + fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, + dsl_dir_phys(dd)->dd_head_dataset_obj); + + va_start(adx, fmt); + log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); + va_end(adx); +} + +void +spa_history_log_version(spa_t *spa, const char *operation) +{ + spa_history_log_internal(spa, operation, NULL, + "pool version %llu; software version %llu/%d; uts %s %s %s %s", + (u_longlong_t)spa_version(spa), SPA_VERSION, ZPL_VERSION, + utsname.nodename, utsname.release, utsname.version, + utsname.machine); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c,v retrieving revision 1.2 diff -u -p -r1.2 spa_misc.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c 28 Mar 2014 03:46:56 -0000 1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c 6 May 2017 15:16:38 -0000 @@ -19,12 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include #include +#include #include #include #include @@ -41,11 +47,18 @@ #include #include #include +#include #include #include #include #include #include "zfs_prop.h" +#include + +#if defined(__FreeBSD__) && defined(_KERNEL) +#include +#include +#endif /* * SPA locking @@ -214,7 +227,7 @@ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * - * spa_rename() is also implemented within this file since is requires + * spa_rename() is also implemented within this file since it requires * manipulation of the namespace. */ @@ -233,8 +246,8 @@ kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG -/* Everything except dprintf is on by default in debug builds */ -int zfs_flags = ~ZFS_DEBUG_DPRINTF; +/* Everything except dprintf and spa is on by default in debug builds */ +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA); #else int zfs_flags = 0; #endif @@ -243,9 +256,212 @@ int zfs_flags = 0; * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. + * This should only be used as a last resort, as it typically results + * in leaked space, or worse. + */ +boolean_t zfs_recover = B_FALSE; + +/* + * If destroy encounters an EIO while reading metadata (e.g. indirect + * blocks), space referenced by the missing metadata can not be freed. + * Normally this causes the background destroy to become "stalled", as + * it is unable to make forward progress. While in this stalled state, + * all remaining space to free from the error-encountering filesystem is + * "temporarily leaked". 
Set this flag to cause it to ignore the EIO, + * permanently leak the space from indirect blocks that can not be read, + * and continue to free everything else that it can. + * + * The default, "stalling" behavior is useful if the storage partially + * fails (i.e. some but not all i/os fail), and then later recovers. In + * this case, we will be able to continue pool operations while it is + * partially failed, and when it recovers, we can continue to free the + * space, with no leaks. However, note that this case is actually + * fairly rare. + * + * Typically pools either (a) fail completely (but perhaps temporarily, + * e.g. a top-level vdev going offline), or (b) have localized, + * permanent errors (e.g. disk returns the wrong data due to bit flip or + * firmware bug). In case (a), this setting does not matter because the + * pool will be suspended and the sync thread will not be able to make + * forward progress regardless. In case (b), because the error is + * permanent, the best we can do is leak the minimum amount of space, + * which is what setting this flag will do. Therefore, it is reasonable + * for this flag to normally be set, but we chose the more conservative + * approach of not setting it, so that there is no possibility of + * leaking space in the "partial temporary" failure case. + */ +boolean_t zfs_free_leak_on_eio = B_FALSE; + +/* + * Expiration time in milliseconds. This value has two meanings. First it is + * used to determine when the spa_deadman() logic should fire. By default the + * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. + * Secondly, the value determines if an I/O is considered "hung". Any I/O that + * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting + * in a system panic. + */ +uint64_t zfs_deadman_synctime_ms = 1000000ULL; + +/* + * Check time in milliseconds. This defines the frequency at which we check + * for hung I/O. + */ +uint64_t zfs_deadman_checktime_ms = 5000ULL; + +/* + * Default value of -1 for zfs_deadman_enabled is resolved in + * zfs_deadman_init() + */ +int zfs_deadman_enabled = -1; + +/* + * The worst case is single-sector max-parity RAID-Z blocks, in which + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) + * times the size; so just assume that. Add to this the fact that + * we can have up to 3 DVAs per bp, and one more factor of 2 because + * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, + * the worst case is: + * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ -int zfs_recover = 0; +int spa_asize_inflation = 24; + +#if defined(__FreeBSD__) && defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0, + "Try to recover from otherwise-fatal errors."); + +static int +sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) +{ + int err, val; + + val = zfs_flags; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + /* + * ZFS_DEBUG_MODIFY must be enabled prior to boot so all + * arc buffers in the system have the necessary additional + * checksum data. However, it is safe to disable at any + * time. 
+ */ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + val &= ~ZFS_DEBUG_MODIFY; + zfs_flags = val; + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), + sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN, + &zfs_deadman_synctime_ms, 0, + "Stalled ZFS I/O expiration time in milliseconds"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN, + &zfs_deadman_checktime_ms, 0, + "Period of checks for stalled ZFS I/O in milliseconds"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN, + &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, + &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); +#endif + +#ifdef __FreeBSD__ +#ifdef _KERNEL +static void +zfs_deadman_init(void) +{ + /* + * If we are not i386 or amd64 or in a virtual machine, + * disable ZFS deadman thread by default + */ + if (zfs_deadman_enabled == -1) { +#if defined(__amd64__) || defined(__i386__) + zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; +#else + zfs_deadman_enabled = 0; +#endif + } +} +#endif /* _KERNEL */ +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ +#ifdef _HARDKERNEL +static struct workqueue *spa_workqueue; + +static void spa_deadman(void *arg); + +static void +spa_deadman_wq(struct work *wk, void *arg) +{ + spa_t *spa = container_of(wk, struct spa, spa_deadman_work); + + spa_deadman(spa); +} + +static void +zfs_deadman_init(void) +{ + int error; + + error = workqueue_create(&spa_workqueue, "spa_deadman", + spa_deadman_wq, NULL, PRI_NONE, IPL_NONE, WQ_MPSAFE); + VERIFY0(error); +} + +static void +zfs_deadman_fini(void) +{ + workqueue_destroy(spa_workqueue); + spa_workqueue = NULL; +} +#else /* !_HARDKERNEL */ +#define zfs_deadman_init() /* nothing */ +#define zfs_deadman_fini() /* nothing */ +#endif /* !_HARDKERNEL */ +#endif /* __NetBSD__ */ + +/* + * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in + * the pool to be consumed. This ensures that we don't run the pool + * completely out of space, due to unaccounted changes (e.g. to the MOS). + * It also limits the worst-case time to allocate space. If we have + * less than this amount of free space, most ZPL operations (e.g. write, + * create) will return ENOSPC. + * + * Certain operations (e.g. file removal, most administrative actions) can + * use half the slop space. They will only return ENOSPC if less than half + * the slop space is free. Typically, once the pool has less than the slop + * space free, the user will use these operations to free up space in the pool. + * These are the operations that call dsl_pool_adjustedsize() with the netfree + * argument set to TRUE. + * + * A very restricted set of operations are always permitted, regardless of + * the amount of free space. These are the operations that call + * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these + * operations result in a net increase in the amount of space used, + * it is possible to run the pool completely out of space, causing it to + * be permanently read-only. + * + * Note that on very small pools, the slop space will be larger than + * 3.2%, in an effort to have it be at least spa_min_slop (128MB), + * but we never allow it to be more than half the pool size. + * + * See also the comments in zfs_space_check_t. 
+ */ +int spa_slop_shift = 5; +SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN, + &spa_slop_shift, 0, + "Shift value of reserved space (1/(2^spa_slop_shift))."); +uint64_t spa_min_slop = 128 * 1024 * 1024; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN, + &spa_min_slop, 0, + "Minimal value of reserved space"); /* * ========================================================================== @@ -259,7 +475,7 @@ spa_config_lock_init(spa_t *spa) spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); - refcount_create(&scl->scl_count); + refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } @@ -289,14 +505,16 @@ spa_config_tryenter(spa_t *spa, int lock if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks ^ (1 << i), tag); + spa_config_exit(spa, locks & ((1 << i) - 1), + tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (!refcount_is_zero(&scl->scl_count)) { mutex_exit(&scl->scl_lock); - spa_config_exit(spa, locks ^ (1 << i), tag); + spa_config_exit(spa, locks & ((1 << i) - 1), + tag); return (0); } scl->scl_writer = curthread; @@ -312,6 +530,8 @@ spa_config_enter(spa_t *spa, int locks, { int wlocks_held = 0; + ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); + for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) @@ -390,31 +610,76 @@ spa_lookup(const char *name) static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; - char c; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + /* * If it's a full dataset name, figure out the pool name and * just use that. */ - cp = strpbrk(name, "/@"); - if (cp) { - c = *cp; + cp = strpbrk(search.spa_name, "/@#"); + if (cp != NULL) *cp = '\0'; - } - (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); spa = avl_find(&spa_namespace_avl, &search, &where); - if (cp) - *cp = c; - return (spa); } /* + * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +static void +spa_deadman(void *arg) +{ + spa_t *spa = arg; + + /* + * Disable the deadman timer if the pool is suspended. + */ + if (spa_suspended(spa)) { +#ifdef illumos + VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); +#else + /* Nothing. just don't schedule any future callouts. */ +#endif + return; + } + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev); +#ifndef illumos +#ifdef _KERNEL + callout_schedule(&spa->spa_deadman_cycid, + hz * zfs_deadman_checktime_ms / MILLISEC); +#endif +#endif +} + +#ifdef _HARDKERNEL +static void +spa_deadman_timeout(void *arg) +{ + spa_t *spa = arg; + +#ifdef __FreeBSD__ + taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task); +#endif +#ifdef __NetBSD__ + workqueue_enqueue(spa_workqueue, &spa->spa_deadman_work, NULL); +#endif +} +#endif /* _KERNEL */ + +/* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. 
The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. @@ -424,6 +689,10 @@ spa_add(const char *name, nvlist_t *conf { spa_t *spa; spa_config_dirent_t *dp; +#ifndef __FreeBSD__ + cyc_handler_t hdlr; + cyc_time_t when; +#endif ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -432,21 +701,24 @@ spa_add(const char *name, nvlist_t *conf mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) - bplist_init(&spa->spa_free_bplist[t]); - bplist_init(&spa->spa_deferred_bplist); + bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; @@ -456,6 +728,55 @@ spa_add(const char *name, nvlist_t *conf spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; +#ifndef __FreeBSD__ + hdlr.cyh_func = spa_deadman; + hdlr.cyh_arg = spa; + hdlr.cyh_level = CY_LOW_LEVEL; +#endif + + spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); + +#ifdef illumos + /* + * This determines how often we need to check for hung I/Os after + * the cyclic has already fired. Since checking for hung I/Os is + * an expensive operation we don't want to check too frequently. + * Instead wait for 5 seconds before checking again. + */ + when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); + when.cyt_when = CY_INFINITY; + mutex_enter(&cpu_lock); + spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); + mutex_exit(&cpu_lock); +#endif +#ifdef __FreeBSD__ +#ifdef _KERNEL + /* + * callout(9) does not provide a way to initialize a callout with + * a function and an argument, so we use callout_reset() to schedule + * the callout in the very distant future. Even if that event ever + * fires, it should be okay as we won't have any active zio-s. + * But normally spa_sync() will reschedule the callout with a proper + * timeout. + * callout(9) does not allow the callback function to sleep but + * vdev_deadman() needs to acquire vq_lock and illumos mutexes are + * emulated using sx(9). For this reason spa_deadman_timeout() + * will schedule spa_deadman() as a task on a taskqueue that allows + * sleeping. 
+ */ + TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa); + callout_init(&spa->spa_deadman_cycid, 1); + callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0, + spa_deadman_timeout, spa, 0); +#endif +#endif +#ifdef __NetBSD__ +#ifdef _HARDKERNEL + callout_init(&spa->spa_deadman_cycid, 0); + callout_setfunc(&spa->spa_deadman_cycid, spa_deadman_timeout, spa); +#endif +#endif + refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); @@ -469,6 +790,9 @@ spa_add(const char *name, nvlist_t *conf spa_active_count++; } + avl_create(&spa->spa_alloc_tree, zio_timestamp_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + /* * Every pool starts with the default cachefile */ @@ -476,11 +800,42 @@ spa_add(const char *name, nvlist_t *conf offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); - dp->scd_path = spa_strdup(spa_config_path); + dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); - if (config != NULL) + VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + + if (config != NULL) { + nvlist_t *features; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) == 0) { + VERIFY(nvlist_dup(features, &spa->spa_label_features, + 0) == 0); + } + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + } + + if (spa->spa_label_features == NULL) { + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + } + + spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); + + spa->spa_min_ashift = INT_MAX; + spa->spa_max_ashift = 0; + + /* + * As a pool is being created, treat all features as disabled by + * setting SPA_FEATURE_DISABLED for all entries in the feature + * refcount cache. 
+ */ + for (int i = 0; i < SPA_FEATURES; i++) { + spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; + } return (spa); } @@ -497,6 +852,7 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); @@ -515,29 +871,56 @@ spa_remove(spa_t *spa) kmem_free(dp, sizeof (spa_config_dirent_t)); } + avl_destroy(&spa->spa_alloc_tree); list_destroy(&spa->spa_config_list); + nvlist_free(spa->spa_label_features); + nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); +#ifdef illumos + mutex_enter(&cpu_lock); + if (spa->spa_deadman_cycid != CYCLIC_NONE) + cyclic_remove(spa->spa_deadman_cycid); + mutex_exit(&cpu_lock); + spa->spa_deadman_cycid = CYCLIC_NONE; +#endif /* !illumos */ +#ifdef __FreeBSD__ +#ifdef _KERNEL + callout_drain(&spa->spa_deadman_cycid); + taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task); +#endif +#endif +#ifdef __NetBSD__ +#ifdef _HARDKERNEL + callout_drain(&spa->spa_deadman_cycid); +#endif +#endif + refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) - bplist_fini(&spa->spa_free_bplist[t]); - bplist_fini(&spa->spa_deferred_bplist); + bplist_destroy(&spa->spa_free_bplist[t]); + + zio_checksum_templates_free(spa); cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); + mutex_destroy(&spa->spa_alloc_lock); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_cksum_tmpls_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); @@ -591,6 +974,20 @@ spa_close(spa_t *spa, void *tag) } /* + * Remove a reference to the given spa_t held by a dsl dir that is + * being asynchronously released. Async releases occur from a taskq + * performing eviction of dsl datasets and dirs. The namespace lock + * isn't held and the hold by the object being evicted may contribute to + * spa_minref (e.g. dataset or directory released during pool export), + * so the asserts in spa_close() do not apply. + */ +void +spa_async_close(spa_t *spa, void *tag) +{ + (void) refcount_remove(&spa->spa_refcount, tag); +} + +/* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool @@ -888,11 +1285,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); - /* - * If the config changed, notify the scrub thread that it must restart. 
- */ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { - dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; spa->spa_config_generation++; } @@ -922,7 +1315,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0); + ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); @@ -959,22 +1352,46 @@ spa_vdev_state_enter(spa_t *spa, int opl { int locks = SCL_STATE_ALL | oplocks; - spa_config_enter(spa, locks, spa, RW_WRITER); + /* + * Root pools may need to read of the underlying devfs filesystem + * when opening up a vdev. Unfortunately if we're holding the + * SCL_ZIO lock it will result in a deadlock when we try to issue + * the read from the root filesystem. Instead we "prefetch" + * the associated vnodes that we need prior to opening the + * underlying devices and cache them so that we can prevent + * any I/O when we are doing the actual open. + */ + if (spa_is_root(spa)) { + int low = locks & ~(SCL_ZIO - 1); + int high = locks & ~low; + + spa_config_enter(spa, high, spa, RW_WRITER); + vdev_hold(spa->spa_root_vdev); + spa_config_enter(spa, low, spa, RW_WRITER); + } else { + spa_config_enter(spa, locks, spa, RW_WRITER); + } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { + boolean_t config_changed = B_FALSE; + if (vd != NULL || error == 0) vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, 0, 0, B_FALSE); if (vd != NULL) { vdev_state_dirty(vd->vdev_top); + config_changed = B_TRUE; spa->spa_config_generation++; } + if (spa_is_root(spa)) + vdev_rele(spa->spa_root_vdev); + ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); @@ -987,6 +1404,15 @@ spa_vdev_state_exit(spa_t *spa, vdev_t * if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); + /* + * If the config changed, update the config cache. + */ + if (config_changed) { + mutex_enter(&spa_namespace_lock); + spa_config_sync(spa, B_FALSE, B_TRUE); + mutex_exit(&spa_namespace_lock); + } + return (error); } @@ -996,6 +1422,30 @@ spa_vdev_state_exit(spa_t *spa, vdev_t * * ========================================================================== */ +void +spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) +{ + if (!nvlist_exists(spa->spa_label_features, feature)) { + fnvlist_add_boolean(spa->spa_label_features, feature); + /* + * When we are creating the pool (tx_txg==TXG_INITIAL), we can't + * dirty the vdev config because lock SCL_CONFIG is not held. + * Thankfully, in this case we don't need to dirty the config + * because it will be written out anyway when we finish + * creating the pool. + */ + if (tx->tx_txg != TXG_INITIAL) + vdev_config_dirty(spa->spa_root_vdev); + } +} + +void +spa_deactivate_mos_feature(spa_t *spa, const char *feature) +{ + if (nvlist_remove_all(spa->spa_label_features, feature) == 0) + vdev_config_dirty(spa->spa_root_vdev); +} + /* * Rename a spa_t. */ @@ -1046,14 +1496,13 @@ spa_rename(const char *name, const char return (0); } - /* - * Determine whether a pool with given pool_guid exists. If device_guid is - * non-zero, determine whether the pool exists *and* contains a device with the - * specified device_guid. + * Return the spa_t associated with given pool_guid, if it exists. 
If + * device_guid is non-zero, determine whether the pool exists *and* contains + * a device with the specified device_guid. */ -boolean_t -spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +spa_t * +spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; @@ -1084,7 +1533,16 @@ spa_guid_exists(uint64_t pool_guid, uint } } - return (spa != NULL); + return (spa); +} + +/* + * Determine whether a pool with the given pool_guid exists. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + return (spa_by_guid(pool_guid, device_guid) != NULL); } char * @@ -1138,11 +1596,31 @@ spa_generate_guid(spa_t *spa) void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { - char *type = dmu_ot[BP_GET_TYPE(bp)].ot_name; - char *checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; - char *compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; + char type[256]; + char *checksum = NULL; + char *compress = NULL; + + if (bp != NULL) { + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + (void) snprintf(type, sizeof (type), "bswap %s %s", + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? + "metadata" : "data", + dmu_ot_byteswap[bswap].ob_name); + } else { + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, + sizeof (type)); + } + if (!BP_IS_EMBEDDED(bp)) { + checksum = + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + } + compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; + } - SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, compress); + SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, + compress); } void @@ -1171,6 +1649,37 @@ zfs_panic_recover(const char *fmt, ...) } /* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexadecimal numbers that don't overflow. + */ +uint64_t +zfs_strtonum(const char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + if (nptr) + *nptr = (char *)str; + + return (val); +} + +/* * ========================================================================== * Accessor functions * ========================================================================== @@ -1188,6 +1697,12 @@ spa_get_dsl(spa_t *spa) return (spa->spa_dsl_pool); } +boolean_t +spa_is_initializing(spa_t *spa) +{ + return (spa->spa_is_initializing); +} + blkptr_t * spa_get_rootblkptr(spa_t *spa) { @@ -1224,16 +1739,40 @@ spa_name(spa_t *spa) uint64_t spa_guid(spa_t *spa) { + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t guid; + /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root - * vdev. We stash the original pool guid in 'spa_load_guid' to handle + * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ - if (spa->spa_root_vdev != NULL) + if (spa->spa_root_vdev == NULL) + return (spa->spa_config_guid); + + guid = spa->spa_last_synced_guid != 0 ? + spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; + + /* + * Return the most recently synced out guid unless we're + * in syncing context. 
+ */ + if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else - return (spa->spa_load_guid); + return (guid); +} + +uint64_t +spa_load_guid(spa_t *spa) +{ + /* + * This is a GUID that exists solely as a reference for the + * purposes of the arc. It is generated at load time, and + * is never written to persistent storage. + */ + return (spa->spa_load_guid); } uint64_t @@ -1276,14 +1815,21 @@ spa_freeze_txg(spa_t *spa) uint64_t spa_get_asize(spa_t *spa, uint64_t lsize) { - /* - * The worst case is single-sector max-parity RAID-Z blocks, in which - * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) - * times the size; so just assume that. Add to this the fact that - * we can have up to 3 DVAs per bp, and one more factor of 2 because - * the block may be dittoed with up to 3 DVAs by ddt_sync(). - */ - return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); + return (lsize * spa_asize_inflation); +} + +/* + * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), + * or at least 128MB, unless that would cause it to be more than half the + * pool size. + * + * See the comment above spa_slop_shift for details. + */ +uint64_t +spa_get_slop_space(spa_t *spa) +{ + uint64_t space = spa_get_dspace(spa); + return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); } uint64_t @@ -1339,6 +1885,34 @@ spa_log_class(spa_t *spa) return (spa->spa_log_class); } +void +spa_evicting_os_register(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_insert_head(&spa->spa_evicting_os_list, os); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_deregister(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_remove(&spa->spa_evicting_os_list, os); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); + while (!list_is_empty(&spa->spa_evicting_os_list)) + cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); + mutex_exit(&spa->spa_evicting_os_lock); + + dmu_buf_user_evict_wait(); +} + int spa_max_replication(spa_t *spa) { @@ -1352,6 +1926,18 @@ spa_max_replication(spa_t *spa) return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } +int +spa_prev_software_version(spa_t *spa) +{ + return (spa->spa_prev_software_version); +} + +uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { @@ -1361,7 +1947,13 @@ dva_get_dsize_sync(spa_t *spa, const dva ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { - vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + uint64_t vdev = DVA_GET_VDEV(dva); + vdev_t *vd = vdev_lookup_top(spa, vdev); + if (vd == NULL) { + panic( + "dva_get_dsize_sync(): bad DVA %llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)asize); + } dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } @@ -1373,7 +1965,7 @@ bp_get_dsize_sync(spa_t *spa, const blkp { uint64_t dsize = 0; - for (int d = 0; d < SPA_DVAS_PER_BP; d++) + for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); @@ -1386,7 +1978,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (int d = 0; d < SPA_DVAS_PER_BP; d++) + for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 
spa_config_exit(spa, SCL_VDEV, FTAG); @@ -1427,6 +2019,12 @@ spa_boot_init() spa_config_load(); } +#ifdef __FreeBSD__ +#ifdef _KERNEL +EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); +#endif +#endif + void spa_init(int mode) { @@ -1446,21 +2044,51 @@ spa_init(int mode) spa_mode_global = mode; - refcount_init(); +#ifdef illumos +#ifdef _KERNEL + spa_arch_init(); +#else + if (spa_mode_global != FREAD && dprintf_find_string("watch")) { + arc_procfd = open("/proc/self/ctl", O_WRONLY); + if (arc_procfd == -1) { + perror("could not enable watchpoints: " + "opening /proc/self/ctl failed: "); + } else { + arc_watch = B_TRUE; + } + } +#endif +#endif /* illumos */ + refcount_sysinit(); unique_init(); + range_tree_init(); + metaslab_alloc_trace_init(); zio_init(); + lz4_init(); dmu_init(); zil_init(); vdev_cache_stat_init(); zfs_prop_init(); zpool_prop_init(); + zpool_feature_init(); spa_config_load(); l2arc_start(); +#ifdef __FreeBSD__ +#ifdef _KERNEL + zfs_deadman_init(); +#endif +#endif /* __FreeBSD__ */ +#ifdef __NetBSD__ + zfs_deadman_init(); +#endif } void spa_fini(void) { +#ifdef __NetBSD__ + zfs_deadman_fini(); +#endif l2arc_stop(); spa_evict_all(); @@ -1468,7 +2096,10 @@ spa_fini(void) vdev_cache_stat_fini(); zil_fini(); dmu_fini(); + lz4_fini(); zio_fini(); + metaslab_alloc_trace_fini(); + range_tree_fini(); unique_fini(); refcount_fini(); @@ -1517,6 +2148,16 @@ spa_writeable(spa_t *spa) return (!!(spa->spa_mode & FWRITE)); } +/* + * Returns true if there is a pending sync task in any of the current + * syncing txg, the current quiescing txg, or the current open txg. + */ +boolean_t +spa_has_pending_synctask(spa_t *spa) +{ + return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks)); +} + int spa_mode(spa_t *spa) { @@ -1546,3 +2187,60 @@ spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } + +/* + * Reset pool scan stat per scan pass (or reboot). + */ +void +spa_scan_stat_init(spa_t *spa) +{ + /* data not stored on disk */ + spa->spa_scan_pass_start = gethrestime_sec(); + spa->spa_scan_pass_exam = 0; + vdev_scan_stat_init(spa->spa_root_vdev); +} + +/* + * Get scan stats for zpool status reports + */ +int +spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) +{ + dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; + + if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + return (SET_ERROR(ENOENT)); + bzero(ps, sizeof (pool_scan_stat_t)); + + /* data stored on disk */ + ps->pss_func = scn->scn_phys.scn_func; + ps->pss_start_time = scn->scn_phys.scn_start_time; + ps->pss_end_time = scn->scn_phys.scn_end_time; + ps->pss_to_examine = scn->scn_phys.scn_to_examine; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_processed = scn->scn_phys.scn_processed; + ps->pss_errors = scn->scn_phys.scn_errors; + ps->pss_state = scn->scn_phys.scn_state; + + /* data not stored on disk */ + ps->pss_pass_start = spa->spa_scan_pass_start; + ps->pss_pass_exam = spa->spa_scan_pass_exam; + + return (0); +} + +boolean_t +spa_debug_enabled(spa_t *spa) +{ + return (spa->spa_debug); +} + +int +spa_maxblocksize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SPA_MAXBLOCKSIZE); + else + return (SPA_OLD_MAXBLOCKSIZE); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c,v retrieving revision 1.4 diff -u -p -r1.4 space_map.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c 19 May 2010 17:50:59 -0000 1.4 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c 27 Mar 2017 06:19:45 -0000 @@ -22,296 +22,81 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ #include #include #include +#include +#include +#include #include #include +#include +#include -/* - * Space map routines. - * NOTE: caller is responsible for all locking. 
- */ -static int -space_map_seg_compare(const void *x1, const void *x2) -{ - const space_seg_t *s1 = x1; - const space_seg_t *s2 = x2; - - if (s1->ss_start < s2->ss_start) { - if (s1->ss_end > s2->ss_start) - return (0); - return (-1); - } - if (s1->ss_start > s2->ss_start) { - if (s1->ss_start < s2->ss_end) - return (0); - return (1); - } - return (0); -} - -void -space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, - kmutex_t *lp) -{ - bzero(sm, sizeof (*sm)); - - cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); - - avl_create(&sm->sm_root, space_map_seg_compare, - sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); - - sm->sm_start = start; - sm->sm_size = size; - sm->sm_shift = shift; - sm->sm_lock = lp; -} - -void -space_map_destroy(space_map_t *sm) -{ - ASSERT(!sm->sm_loaded && !sm->sm_loading); - VERIFY3U(sm->sm_space, ==, 0); - avl_destroy(&sm->sm_root); - cv_destroy(&sm->sm_load_cv); -} - -void -space_map_add(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_index_t where; - space_seg_t ssearch, *ss_before, *ss_after, *ss; - uint64_t end = start + size; - int merge_before, merge_after; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(size != 0); - VERIFY3U(start, >=, sm->sm_start); - VERIFY3U(end, <=, sm->sm_start + sm->sm_size); - VERIFY(sm->sm_space + size <= sm->sm_size); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) { - zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size); - return; - } - - /* Make sure we don't overlap with either of our neighbors */ - VERIFY(ss == NULL); - - ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE); - ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER); - - merge_before = (ss_before != NULL && ss_before->ss_end == start); - merge_after = (ss_after != NULL && ss_after->ss_start == end); - - if (merge_before && merge_after) { - avl_remove(&sm->sm_root, ss_before); - if (sm->sm_pp_root) { - avl_remove(sm->sm_pp_root, ss_before); - avl_remove(sm->sm_pp_root, ss_after); - } - ss_after->ss_start = ss_before->ss_start; - kmem_free(ss_before, sizeof (*ss_before)); - ss = ss_after; - } else if (merge_before) { - ss_before->ss_end = end; - if (sm->sm_pp_root) - avl_remove(sm->sm_pp_root, ss_before); - ss = ss_before; - } else if (merge_after) { - ss_after->ss_start = start; - if (sm->sm_pp_root) - avl_remove(sm->sm_pp_root, ss_after); - ss = ss_after; - } else { - ss = kmem_alloc(sizeof (*ss), KM_SLEEP); - ss->ss_start = start; - ss->ss_end = end; - avl_insert(&sm->sm_root, ss, where); - } - - if (sm->sm_pp_root) - avl_add(sm->sm_pp_root, ss); - - sm->sm_space += size; -} - -void -space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_index_t where; - space_seg_t ssearch, *ss, *newseg; - uint64_t end = start + size; - int left_over, right_over; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(size != 0); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - /* Make sure we completely overlap with someone */ - if (ss == NULL) { - zfs_panic_recover("zfs: freeing free segment " - "(offset=%llu size=%llu)", - (longlong_t)start, (longlong_t)size); - 
return; - } - VERIFY3U(ss->ss_start, <=, start); - VERIFY3U(ss->ss_end, >=, end); - VERIFY(sm->sm_space - size <= sm->sm_size); - - left_over = (ss->ss_start != start); - right_over = (ss->ss_end != end); - - if (sm->sm_pp_root) - avl_remove(sm->sm_pp_root, ss); - - if (left_over && right_over) { - newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); - newseg->ss_start = end; - newseg->ss_end = ss->ss_end; - ss->ss_end = start; - avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); - if (sm->sm_pp_root) - avl_add(sm->sm_pp_root, newseg); - } else if (left_over) { - ss->ss_end = start; - } else if (right_over) { - ss->ss_start = end; - } else { - avl_remove(&sm->sm_root, ss); - kmem_free(ss, sizeof (*ss)); - ss = NULL; - } - - if (sm->sm_pp_root && ss != NULL) - avl_add(sm->sm_pp_root, ss); - - sm->sm_space -= size; -} - -boolean_t -space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_index_t where; - space_seg_t ssearch, *ss; - uint64_t end = start + size; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - VERIFY(size != 0); - VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); - VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); - - ssearch.ss_start = start; - ssearch.ss_end = end; - ss = avl_find(&sm->sm_root, &ssearch, &where); - - return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end); -} - -void -space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) -{ - space_seg_t *ss; - void *cookie = NULL; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - - while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { - if (func != NULL) - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); - kmem_free(ss, sizeof (*ss)); - } - sm->sm_space = 0; -} - -void -space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) -{ - space_seg_t *ss; - - ASSERT(MUTEX_HELD(sm->sm_lock)); - - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); -} +SYSCTL_DECL(_vfs_zfs); /* - * Wait for any in-progress space_map_load() to complete. + * The data for a given space map can be kept on blocks of any size. + * Larger blocks entail fewer i/o operations, but they also cause the + * DMU to keep more data in-core, and also to waste more i/o bandwidth + * when only a few blocks have changed since the last transaction group. */ -void -space_map_load_wait(space_map_t *sm) -{ - ASSERT(MUTEX_HELD(sm->sm_lock)); - - while (sm->sm_loading) { - ASSERT(!sm->sm_loaded); - cv_wait(&sm->sm_load_cv, sm->sm_lock); - } -} +int space_map_blksz = (1 << 12); +SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, 0, + "Maximum block size for space map. Must be power of 2 and greater than 4096."); /* + * Load the space map disk into the specified range tree. Segments of maptype + * are added to the range tree, other segment types are removed. + * * Note: space_map_load() will drop sm_lock across dmu_read() calls. * The caller must be OK with this. 
*/ int -space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, - space_map_obj_t *smo, objset_t *os) +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) { uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset, end, space; - uint64_t mapstart = sm->sm_start; int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); - ASSERT(!sm->sm_loaded); - ASSERT(!sm->sm_loading); - sm->sm_loading = B_TRUE; - end = smo->smo_objsize; - space = smo->smo_alloc; + end = space_map_length(sm); + space = space_map_allocated(sm); - ASSERT(sm->sm_ops == NULL); - VERIFY3U(sm->sm_space, ==, 0); + VERIFY0(range_tree_space(rt)); if (maptype == SM_FREE) { - space_map_add(sm, sm->sm_start, sm->sm_size); + range_tree_add(rt, sm->sm_start, sm->sm_size); space = sm->sm_size - space; } - bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT; + bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); entry_map = zio_buf_alloc(bufsize); mutex_exit(sm->sm_lock); - if (end > bufsize) - dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize); + if (end > bufsize) { + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, + end - bufsize, ZIO_PRIORITY_SYNC_READ); + } mutex_enter(sm->sm_lock); for (offset = 0; offset < end; offset += bufsize) { size = MIN(end - offset, bufsize); VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); VERIFY(size != 0); + ASSERT3U(sm->sm_blksz, !=, 0); dprintf("object=%llu offset=%llx size=%llx\n", - smo->smo_object, offset, size); + space_map_object(sm), offset, size); mutex_exit(sm->sm_lock); - error = dmu_read(os, smo->smo_object, offset, size, entry_map, - DMU_READ_PREFETCH); + error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, + entry_map, DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; @@ -319,115 +104,176 @@ space_map_load(space_map_t *sm, space_ma entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { uint64_t e = *entry; + uint64_t offset, size; if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ continue; - (SM_TYPE_DECODE(e) == maptype ? 
- space_map_add : space_map_remove)(sm, - (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart, - SM_RUN_DECODE(e) << sm->sm_shift); + offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + + sm->sm_start; + size = SM_RUN_DECODE(e) << sm->sm_shift; + + VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); + VERIFY3U(offset, >=, sm->sm_start); + VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); + if (SM_TYPE_DECODE(e) == maptype) { + VERIFY3U(range_tree_space(rt) + size, <=, + sm->sm_size); + range_tree_add(rt, offset, size); + } else { + range_tree_remove(rt, offset, size); + } } } - if (error == 0) { - VERIFY3U(sm->sm_space, ==, space); - - sm->sm_loaded = B_TRUE; - sm->sm_ops = ops; - if (ops != NULL) - ops->smop_load(sm); - } else { - space_map_vacate(sm, NULL, NULL); - } + if (error == 0) + VERIFY3U(range_tree_space(rt), ==, space); + else + range_tree_vacate(rt, NULL, NULL); zio_buf_free(entry_map, bufsize); - - sm->sm_loading = B_FALSE; - - cv_broadcast(&sm->sm_load_cv); - return (error); } void -space_map_unload(space_map_t *sm) +space_map_histogram_clear(space_map_t *sm) { - ASSERT(MUTEX_HELD(sm->sm_lock)); - - if (sm->sm_loaded && sm->sm_ops != NULL) - sm->sm_ops->smop_unload(sm); - - sm->sm_loaded = B_FALSE; - sm->sm_ops = NULL; + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) + return; - space_map_vacate(sm, NULL, NULL); + bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); } -uint64_t -space_map_maxsize(space_map_t *sm) +boolean_t +space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) { - ASSERT(sm->sm_ops != NULL); - return (sm->sm_ops->smop_max(sm)); + /* + * Verify that the in-core range tree does not have any + * ranges smaller than our sm_shift size. + */ + for (int i = 0; i < sm->sm_shift; i++) { + if (rt->rt_histogram[i] != 0) + return (B_FALSE); + } + return (B_TRUE); } -uint64_t -space_map_alloc(space_map_t *sm, uint64_t size) +void +space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) { - uint64_t start; + int idx = 0; - start = sm->sm_ops->smop_alloc(sm, size); - if (start != -1ULL) - space_map_remove(sm, start, size); - return (start); -} + ASSERT(MUTEX_HELD(rt->rt_lock)); + ASSERT(dmu_tx_is_syncing(tx)); + VERIFY3U(space_map_object(sm), !=, 0); -void -space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) -{ - sm->sm_ops->smop_claim(sm, start, size); - space_map_remove(sm, start, size); + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) + return; + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + ASSERT(space_map_histogram_verify(sm, rt)); + /* + * Transfer the content of the range tree histogram to the space + * map histogram. The space map histogram contains 32 buckets ranging + * between 2^sm_shift to 2^(32+sm_shift-1). The range tree, + * however, can represent ranges from 2^0 to 2^63. Since the space + * map only cares about allocatable blocks (minimum of sm_shift) we + * can safely ignore all ranges in the range tree smaller than sm_shift. + */ + for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + + /* + * Since the largest histogram bucket in the space map is + * 2^(32+sm_shift-1), we need to normalize the values in + * the range tree for any bucket larger than that size. For + * example given an sm_shift of 9, ranges larger than 2^40 + * would get normalized as if they were 1TB ranges. Assume + * the range tree had a count of 5 in the 2^44 (16TB) bucket, + * the calculation below would normalize this to 5 * 2^4 (16). 
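To make the normalization described above concrete, here is a small user-space sketch (not part of the patch) that runs the same bucket-folding loop over a toy range-tree histogram. The bucket counts mirror the kernel constants (64 range-tree buckets, 32 space-map buckets); the sm_shift of 9 and the single populated bucket are taken from the worked example in the comment.

#include <stdio.h>
#include <stdint.h>

#define	RT_BUCKETS	64	/* RANGE_TREE_HISTOGRAM_SIZE */
#define	SM_BUCKETS	32	/* SPACE_MAP_HISTOGRAM_SIZE */

int
main(void)
{
	uint64_t rt_histogram[RT_BUCKETS] = { 0 };
	uint64_t smp_histogram[SM_BUCKETS] = { 0 };
	int sm_shift = 9;	/* 512-byte allocation unit */
	int idx = 0;

	rt_histogram[44] = 5;	/* five ~16TB ranges, as in the comment above */

	/* Same folding loop as space_map_histogram_add(). */
	for (int i = sm_shift; i < RT_BUCKETS; i++) {
		smp_histogram[idx] += rt_histogram[i] << (i - idx - sm_shift);
		if (idx < SM_BUCKETS - 1)
			idx++;
	}

	/* The 2^44 count lands in the last bucket, scaled by 2^4: 5 * 16 = 80. */
	printf("last bucket: %llu\n",
	    (unsigned long long)smp_histogram[SM_BUCKETS - 1]);
	return (0);
}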
+ */ + ASSERT3U(i, >=, idx + sm->sm_shift); + sm->sm_phys->smp_histogram[idx] += + rt->rt_histogram[i] << (i - idx - sm->sm_shift); + + /* + * Increment the space map's index as long as we haven't + * reached the maximum bucket size. Accumulate all ranges + * larger than the max bucket size into the last bucket. + */ + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { + ASSERT3U(idx + sm->sm_shift, ==, i); + idx++; + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); + } + } } -void -space_map_free(space_map_t *sm, uint64_t start, uint64_t size) +uint64_t +space_map_entries(space_map_t *sm, range_tree_t *rt) { - space_map_add(sm, start, size); - sm->sm_ops->smop_free(sm, start, size); + avl_tree_t *t = &rt->rt_root; + range_seg_t *rs; + uint64_t size, entries; + + /* + * All space_maps always have a debug entry so account for it here. + */ + entries = 1; + + /* + * Traverse the range tree and calculate the number of space map + * entries that would be required to write out the range tree. + */ + for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { + size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + entries += howmany(size, SM_RUN_MAX); + } + return (entries); } /* - * Note: space_map_sync() will drop sm_lock across dmu_write() calls. + * Note: space_map_write() will drop sm_lock across dmu_write() calls. */ void -space_map_sync(space_map_t *sm, uint8_t maptype, - space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) +space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + dmu_tx_t *tx) { + objset_t *os = sm->sm_os; spa_t *spa = dmu_objset_spa(os); - void *cookie = NULL; - space_seg_t *ss; - uint64_t bufsize, start, size, run_len; + avl_tree_t *t = &rt->rt_root; + range_seg_t *rs; + uint64_t size, total, rt_space, nodes; uint64_t *entry, *entry_map, *entry_map_end; + uint64_t expected_entries, actual_entries = 1; - ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT(MUTEX_HELD(rt->rt_lock)); + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + VERIFY3U(space_map_object(sm), !=, 0); + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + /* + * This field is no longer necessary since the in-core space map + * now contains the object number but is maintained for backwards + * compatibility. + */ + sm->sm_phys->smp_object = sm->sm_object; - if (sm->sm_space == 0) + if (range_tree_space(rt) == 0) { + VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); return; - - dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n", - smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa), - maptype == SM_ALLOC ? 
'A' : 'F', avl_numnodes(&sm->sm_root), - sm->sm_space); + } if (maptype == SM_ALLOC) - smo->smo_alloc += sm->sm_space; + sm->sm_phys->smp_alloc += range_tree_space(rt); else - smo->smo_alloc -= sm->sm_space; + sm->sm_phys->smp_alloc -= range_tree_space(rt); - bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); - bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT); - entry_map = zio_buf_alloc(bufsize); - entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); + expected_entries = space_map_entries(sm, rt); + + entry_map = zio_buf_alloc(sm->sm_blksz); + entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t)); entry = entry_map; *entry++ = SM_DEBUG_ENCODE(1) | @@ -435,22 +281,29 @@ space_map_sync(space_map_t *sm, uint8_t SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { - size = ss->ss_end - ss->ss_start; - start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; + total = 0; + nodes = avl_numnodes(&rt->rt_root); + rt_space = range_tree_space(rt); + for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { + uint64_t start; + + size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + + total += size << sm->sm_shift; - sm->sm_space -= size; - size >>= sm->sm_shift; + while (size != 0) { + uint64_t run_len; - while (size) { run_len = MIN(size, SM_RUN_MAX); if (entry == entry_map_end) { - mutex_exit(sm->sm_lock); - dmu_write(os, smo->smo_object, smo->smo_objsize, - bufsize, entry_map, tx); - mutex_enter(sm->sm_lock); - smo->smo_objsize += bufsize; + mutex_exit(rt->rt_lock); + dmu_write(os, space_map_object(sm), + sm->sm_phys->smp_objsize, sm->sm_blksz, + entry_map, tx); + mutex_enter(rt->rt_lock); + sm->sm_phys->smp_objsize += sm->sm_blksz; entry = entry_map; } @@ -460,157 +313,234 @@ space_map_sync(space_map_t *sm, uint8_t start += run_len; size -= run_len; + actual_entries++; } - kmem_free(ss, sizeof (*ss)); } if (entry != entry_map) { size = (entry - entry_map) * sizeof (uint64_t); - mutex_exit(sm->sm_lock); - dmu_write(os, smo->smo_object, smo->smo_objsize, + mutex_exit(rt->rt_lock); + dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, size, entry_map, tx); - mutex_enter(sm->sm_lock); - smo->smo_objsize += size; + mutex_enter(rt->rt_lock); + sm->sm_phys->smp_objsize += size; } + ASSERT3U(expected_entries, ==, actual_entries); - zio_buf_free(entry_map, bufsize); + /* + * Ensure that the space_map's accounting wasn't changed + * while we were in the middle of writing it out. + */ + VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); + VERIFY3U(range_tree_space(rt), ==, rt_space); + VERIFY3U(range_tree_space(rt), ==, total); - VERIFY3U(sm->sm_space, ==, 0); + zio_buf_free(entry_map, sm->sm_blksz); } -void -space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) +static int +space_map_open_impl(space_map_t *sm) { - VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0); + int error; + u_longlong_t blocks; + + error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf); + if (error) + return (error); - smo->smo_objsize = 0; - smo->smo_alloc = 0; + dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); + sm->sm_phys = sm->sm_dbuf->db_data; + return (0); } -/* - * Space map reference trees. - * - * A space map is a collection of integers. Every integer is either - * in the map, or it's not. 
A space map reference tree generalizes - * the idea: it allows its members to have arbitrary reference counts, - * as opposed to the implicit reference count of 0 or 1 in a space map. - * This representation comes in handy when computing the union or - * intersection of multiple space maps. For example, the union of - * N space maps is the subset of the reference tree with refcnt >= 1. - * The intersection of N space maps is the subset with refcnt >= N. - * - * [It's very much like a Fourier transform. Unions and intersections - * are hard to perform in the 'space map domain', so we convert the maps - * into the 'reference count domain', where it's trivial, then invert.] - * - * vdev_dtl_reassess() uses computations of this form to determine - * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev - * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev - * has an outage wherever refcnt >= vdev_children. - */ -static int -space_map_ref_compare(const void *x1, const void *x2) +int +space_map_open(space_map_t **smp, objset_t *os, uint64_t object, + uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp) { - const space_ref_t *sr1 = x1; - const space_ref_t *sr2 = x2; + space_map_t *sm; + int error; + + ASSERT(*smp == NULL); + ASSERT(os != NULL); + ASSERT(object != 0); - if (sr1->sr_offset < sr2->sr_offset) - return (-1); - if (sr1->sr_offset > sr2->sr_offset) - return (1); - - if (sr1 < sr2) - return (-1); - if (sr1 > sr2) - return (1); + sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); + + sm->sm_start = start; + sm->sm_size = size; + sm->sm_shift = shift; + sm->sm_lock = lp; + sm->sm_os = os; + sm->sm_object = object; + + error = space_map_open_impl(sm); + if (error != 0) { + space_map_close(sm); + return (error); + } + + *smp = sm; return (0); } void -space_map_ref_create(avl_tree_t *t) +space_map_close(space_map_t *sm) { - avl_create(t, space_map_ref_compare, - sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); + if (sm == NULL) + return; + + if (sm->sm_dbuf != NULL) + dmu_buf_rele(sm->sm_dbuf, sm); + sm->sm_dbuf = NULL; + sm->sm_phys = NULL; + + kmem_free(sm, sizeof (*sm)); } void -space_map_ref_destroy(avl_tree_t *t) +space_map_truncate(space_map_t *sm, dmu_tx_t *tx) { - space_ref_t *sr; - void *cookie = NULL; + objset_t *os = sm->sm_os; + spa_t *spa = dmu_objset_spa(os); + dmu_object_info_t doi; - while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(sr, sizeof (*sr)); + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(dmu_tx_is_syncing(tx)); - avl_destroy(t); -} + dmu_object_info_from_db(sm->sm_dbuf, &doi); -static void -space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) -{ - space_ref_t *sr; + /* + * If the space map has the wrong bonus size (because + * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or + * the wrong block size (because space_map_blksz has changed), + * free and re-allocate its object with the updated sizes. + * + * Otherwise, just truncate the current object. 
+ */ + if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + doi.doi_bonus_size != sizeof (space_map_phys_t)) || + doi.doi_data_block_size != space_map_blksz) { + zfs_dbgmsg("txg %llu, spa %s, reallocating: " + "old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), + spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size); - sr = kmem_alloc(sizeof (*sr), KM_SLEEP); - sr->sr_offset = offset; - sr->sr_refcnt = refcnt; + space_map_free(sm, tx); + dmu_buf_rele(sm->sm_dbuf, sm); - avl_add(t, sr); -} + sm->sm_object = space_map_alloc(sm->sm_os, tx); + VERIFY0(space_map_open_impl(sm)); + } else { + VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); -void -space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, - int64_t refcnt) -{ - space_map_ref_add_node(t, start, refcnt); - space_map_ref_add_node(t, end, -refcnt); + /* + * If the spacemap is reallocated, its histogram + * will be reset. Do the same in the common case so that + * bugs related to the uncommon case do not go unnoticed. + */ + bzero(sm->sm_phys->smp_histogram, + sizeof (sm->sm_phys->smp_histogram)); + } + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + sm->sm_phys->smp_objsize = 0; + sm->sm_phys->smp_alloc = 0; } /* - * Convert (or add) a space map into a reference tree. + * Update the in-core space_map allocation and length values. */ void -space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt) +space_map_update(space_map_t *sm) { - space_seg_t *ss; + if (sm == NULL) + return; ASSERT(MUTEX_HELD(sm->sm_lock)); - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); + sm->sm_alloc = sm->sm_phys->smp_alloc; + sm->sm_length = sm->sm_phys->smp_objsize; } -/* - * Convert a reference tree into a space map. The space map will contain - * all members of the reference tree for which refcnt >= minref. 
- */ -void -space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref) +uint64_t +space_map_alloc(objset_t *os, dmu_tx_t *tx) { - uint64_t start = -1ULL; - int64_t refcnt = 0; - space_ref_t *sr; + spa_t *spa = dmu_objset_spa(os); + uint64_t object; + int bonuslen; - ASSERT(MUTEX_HELD(sm->sm_lock)); + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); + bonuslen = sizeof (space_map_phys_t); + ASSERT3U(bonuslen, <=, dmu_bonus_max()); + } else { + bonuslen = SPACE_MAP_SIZE_V0; + } - space_map_vacate(sm, NULL, NULL); + object = dmu_object_alloc(os, + DMU_OT_SPACE_MAP, space_map_blksz, + DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); - for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { - refcnt += sr->sr_refcnt; - if (refcnt >= minref) { - if (start == -1ULL) { - start = sr->sr_offset; - } - } else { - if (start != -1ULL) { - uint64_t end = sr->sr_offset; - ASSERT(start <= end); - if (end > start) - space_map_add(sm, start, end - start); - start = -1ULL; - } + return (object); +} + +void +space_map_free(space_map_t *sm, dmu_tx_t *tx) +{ + spa_t *spa; + + if (sm == NULL) + return; + + spa = dmu_objset_spa(sm->sm_os); + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + dmu_object_info_t doi; + + dmu_object_info_from_db(sm->sm_dbuf, &doi); + if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { + VERIFY(spa_feature_is_active(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM)); + spa_feature_decr(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); } } - ASSERT(refcnt == 0); - ASSERT(start == -1ULL); + + VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0); + sm->sm_object = 0; +} + +uint64_t +space_map_object(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_object : 0); +} + +/* + * Returns the already synced, on-disk allocated space. + */ +uint64_t +space_map_allocated(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_alloc : 0); +} + +/* + * Returns the already synced, on-disk length; + */ +uint64_t +space_map_length(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_length : 0); +} + +/* + * Returns the allocated space that is currently syncing. + */ +int64_t +space_map_alloc_delta(space_map_t *sm) +{ + if (sm == NULL) + return (0); + ASSERT(sm->sm_dbuf != NULL); + return (sm->sm_phys->smp_alloc - space_map_allocated(sm)); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/space_reftree.c 22 Nov 2015 17:22:31 -0000 @@ -0,0 +1,159 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ + +#include +#include +#include + +/* + * Space reference trees. + * + * A range tree is a collection of integers. Every integer is either + * in the tree, or it's not. A space reference tree generalizes + * the idea: it allows its members to have arbitrary reference counts, + * as opposed to the implicit reference count of 0 or 1 in a range tree. + * This representation comes in handy when computing the union or + * intersection of multiple space maps. For example, the union of + * N range trees is the subset of the reference tree with refcnt >= 1. + * The intersection of N range trees is the subset with refcnt >= N. + * + * [It's very much like a Fourier transform. Unions and intersections + * are hard to perform in the 'range tree domain', so we convert the trees + * into the 'reference count domain', where it's trivial, then invert.] + * + * vdev_dtl_reassess() uses computations of this form to determine + * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev + * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev + * has an outage wherever refcnt >= vdev_children. + */ +static int +space_reftree_compare(const void *x1, const void *x2) +{ + const space_ref_t *sr1 = x1; + const space_ref_t *sr2 = x2; + + if (sr1->sr_offset < sr2->sr_offset) + return (-1); + if (sr1->sr_offset > sr2->sr_offset) + return (1); + + if (sr1 < sr2) + return (-1); + if (sr1 > sr2) + return (1); + + return (0); +} + +void +space_reftree_create(avl_tree_t *t) +{ + avl_create(t, space_reftree_compare, + sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); +} + +void +space_reftree_destroy(avl_tree_t *t) +{ + space_ref_t *sr; + void *cookie = NULL; + + while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(sr, sizeof (*sr)); + + avl_destroy(t); +} + +static void +space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) +{ + space_ref_t *sr; + + sr = kmem_alloc(sizeof (*sr), KM_SLEEP); + sr->sr_offset = offset; + sr->sr_refcnt = refcnt; + + avl_add(t, sr); +} + +void +space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt) +{ + space_reftree_add_node(t, start, refcnt); + space_reftree_add_node(t, end, -refcnt); +} + +/* + * Convert (or add) a range tree into a reference tree. + */ +void +space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) +{ + range_seg_t *rs; + + ASSERT(MUTEX_HELD(rt->rt_lock)); + + for (rs = avl_first(&rt->rt_root); rs; rs = AVL_NEXT(&rt->rt_root, rs)) + space_reftree_add_seg(t, rs->rs_start, rs->rs_end, refcnt); +} + +/* + * Convert a reference tree into a range tree. The range tree will contain + * all members of the reference tree for which refcnt >= minref. 
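As an illustrative aside (not part of the change being diffed): the reference-tree idea described above can be demonstrated with plain arrays. Each segment [start, end) contributes +refcnt at start and -refcnt at end; scanning the points in offset order and emitting ranges where the running count is >= minref yields the union (minref = 1) or the intersection (minref = N) of the inputs, which is exactly what space_reftree_generate_map() computes. The input segments below are made up.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct ref { uint64_t offset; int64_t delta; };

static int
ref_cmp(const void *a, const void *b)
{
	const struct ref *r1 = a, *r2 = b;

	if (r1->offset < r2->offset) return (-1);
	if (r1->offset > r2->offset) return (1);
	return (0);
}

/* Emit all ranges whose running reference count is >= minref. */
static void
generate(const struct ref *refs, int nrefs, int64_t minref)
{
	int64_t refcnt = 0;
	uint64_t start = UINT64_MAX;

	for (int i = 0; i < nrefs; i++) {
		refcnt += refs[i].delta;
		if (refcnt >= minref) {
			if (start == UINT64_MAX)
				start = refs[i].offset;
		} else if (start != UINT64_MAX) {
			printf("[%llu, %llu) ", (unsigned long long)start,
			    (unsigned long long)refs[i].offset);
			start = UINT64_MAX;
		}
	}
	printf("\n");
}

int
main(void)
{
	/* Two trees: [0,10) and [5,15); each segment adds +1/-1 points. */
	struct ref refs[] = {
		{ 0, +1 }, { 10, -1 },		/* first tree */
		{ 5, +1 }, { 15, -1 },		/* second tree */
	};
	int n = sizeof (refs) / sizeof (refs[0]);

	qsort(refs, n, sizeof (refs[0]), ref_cmp);
	generate(refs, n, 1);	/* union:        [0, 15) */
	generate(refs, n, 2);	/* intersection: [5, 10) */
	return (0);
}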
+ */ +void +space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) +{ + uint64_t start = -1ULL; + int64_t refcnt = 0; + space_ref_t *sr; + + ASSERT(MUTEX_HELD(rt->rt_lock)); + + range_tree_vacate(rt, NULL, NULL); + + for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { + refcnt += sr->sr_refcnt; + if (refcnt >= minref) { + if (start == -1ULL) { + start = sr->sr_offset; + } + } else { + if (start != -1ULL) { + uint64_t end = sr->sr_offset; + ASSERT(start <= end); + if (end > start) + range_tree_add(rt, start, end - start); + start = -1ULL; + } + } + } + ASSERT(refcnt == 0); + ASSERT(start == -1ULL); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/trim_map.c 22 Apr 2017 10:02:25 -0000 @@ -0,0 +1,657 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012 Pawel Jakub Dawidek . + * All rights reserved. + */ + +#include +#include +#include +#include +#include + +/* + * Calculate the zio end, upgrading based on ashift which would be + * done by zio_vdev_io_start. + * + * This makes free range consolidation much more effective + * than it would otherwise be as well as ensuring that entire + * blocks are invalidated by writes. + */ +#define TRIM_ZIO_END(vd, offset, size) (offset + \ + P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) + +/* Maximal segment size for ATA TRIM. */ +#define TRIM_MAP_SIZE_FACTOR (512 << 16) + +#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR) + +#define TRIM_MAP_ADD(tm, ts) do { \ + list_insert_tail(&(tm)->tm_head, (ts)); \ + (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ +} while (0) + +#define TRIM_MAP_REM(tm, ts) do { \ + list_remove(&(tm)->tm_head, (ts)); \ + (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ +} while (0) + +typedef struct trim_map { + list_t tm_head; /* List of segments sorted by txg. */ + avl_tree_t tm_queued_frees; /* AVL tree of segments waiting for TRIM. */ + avl_tree_t tm_inflight_frees; /* AVL tree of in-flight TRIMs. */ + avl_tree_t tm_inflight_writes; /* AVL tree of in-flight writes. */ + list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ + kmutex_t tm_lock; + uint64_t tm_pending; /* Count of pending TRIMs. */ +} trim_map_t; + +typedef struct trim_seg { + avl_node_t ts_node; /* AVL node. */ + list_node_t ts_next; /* List element. */ + uint64_t ts_start; /* Starting offset of this segment. 
*/ + uint64_t ts_end; /* Ending offset (non-inclusive). */ + uint64_t ts_txg; /* Segment creation txg. */ + hrtime_t ts_time; /* Segment creation time. */ +} trim_seg_t; + +extern boolean_t zfs_trim_enabled; + +static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */ +static u_int trim_timeout = 30; /* Keep deleted data up to 30s */ +static u_int trim_max_interval = 1; /* 1s delays between TRIMs */ +static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */ + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM"); + +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay, + 0, "Delay TRIMs by up to this many TXGs"); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0, + "Delay TRIMs by up to this many seconds"); +SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN, + &trim_max_interval, 0, + "Maximum interval between TRIM queue processing (seconds)"); + +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, + &trim_vdev_max_pending, 0, + "Maximum pending TRIM segments for a vdev"); + +static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd); + +static int +trim_map_seg_compare(const void *x1, const void *x2) +{ + const trim_seg_t *s1 = x1; + const trim_seg_t *s2 = x2; + + if (s1->ts_start < s2->ts_start) { + if (s1->ts_end > s2->ts_start) + return (0); + return (-1); + } + if (s1->ts_start > s2->ts_start) { + if (s1->ts_start < s2->ts_end) + return (0); + return (1); + } + return (0); +} + +static int +trim_map_zio_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_offset < z2->io_offset) { + if (z1->io_offset + z1->io_size > z2->io_offset) + return (0); + return (-1); + } + if (z1->io_offset > z2->io_offset) { + if (z1->io_offset < z2->io_offset + z2->io_size) + return (0); + return (1); + } + return (0); +} + +void +trim_map_create(vdev_t *vd) +{ + trim_map_t *tm; + + ASSERT(zfs_trim_enabled && !vd->vdev_notrim && + vd->vdev_ops->vdev_op_leaf); + + tm = kmem_zalloc(sizeof (*tm), KM_SLEEP); + mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&tm->tm_head, sizeof (trim_seg_t), + offsetof(trim_seg_t, ts_next)); + list_create(&tm->tm_pending_writes, sizeof (zio_t), + offsetof(zio_t, io_trim_link)); + avl_create(&tm->tm_queued_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_frees, trim_map_seg_compare, + sizeof (trim_seg_t), offsetof(trim_seg_t, ts_node)); + avl_create(&tm->tm_inflight_writes, trim_map_zio_compare, + sizeof (zio_t), offsetof(zio_t, io_trim_node)); + vd->vdev_trimmap = tm; +} + +void +trim_map_destroy(vdev_t *vd) +{ + trim_map_t *tm; + trim_seg_t *ts; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (!zfs_trim_enabled) + return; + + tm = vd->vdev_trimmap; + if (tm == NULL) + return; + + /* + * We may have been called before trim_map_vdev_commit_done() + * had a chance to run, so do it now to prune the remaining + * inflight frees. 
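A quick sanity check of the arithmetic behind the TRIM_ZIO_END and TRIM_MAP_SEGS macros defined earlier in this file (an illustration, not part of the patch): the first rounds a freed range's end up to the vdev's ashift so that whole blocks are invalidated, and the second counts how many pending TRIM segments a range consumes given the 512 << 16 (32 MB) per-segment factor. P2ROUNDUP below uses the usual illumos definition; the offset, size and ashift values are made up.

#include <stdio.h>
#include <stdint.h>

/* Round x up to a power-of-two alignment (illumos sysmacros.h definition). */
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

#define	TRIM_MAP_SIZE_FACTOR	(512 << 16)		/* 32 MB */
#define	TRIM_MAP_SEGS(size)	(1 + (size) / TRIM_MAP_SIZE_FACTOR)

int
main(void)
{
	uint64_t ashift = 12;			/* 4 KB blocks */
	uint64_t offset = 0x1000, size = 0x600;

	/* Same rounding as TRIM_ZIO_END(): the end is pushed out to 0x2000. */
	uint64_t end = offset + P2ROUNDUP(size, 1ULL << ashift);
	printf("end = 0x%llx\n", (unsigned long long)end);

	/* A 100 MB free range accounts for 1 + 100/32 = 4 pending segments. */
	printf("segs = %llu\n",
	    (unsigned long long)TRIM_MAP_SEGS(100ULL << 20));
	return (0);
}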
+ */ + trim_map_vdev_commit_done(vd->vdev_spa, vd); + + mutex_enter(&tm->tm_lock); + while ((ts = list_head(&tm->tm_head)) != NULL) { + avl_remove(&tm->tm_queued_frees, ts); + TRIM_MAP_REM(tm, ts); + kmem_free(ts, sizeof (*ts)); + } + mutex_exit(&tm->tm_lock); + + avl_destroy(&tm->tm_queued_frees); + avl_destroy(&tm->tm_inflight_frees); + avl_destroy(&tm->tm_inflight_writes); + list_destroy(&tm->tm_pending_writes); + list_destroy(&tm->tm_head); + mutex_destroy(&tm->tm_lock); + kmem_free(tm, sizeof (*tm)); + vd->vdev_trimmap = NULL; +} + +static void +trim_map_segment_add(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + avl_index_t where; + trim_seg_t tsearch, *ts_before, *ts_after, *ts; + boolean_t merge_before, merge_after; + hrtime_t time; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(start < end); + + time = gethrtime(); + tsearch.ts_start = start; + tsearch.ts_end = end; + + ts = avl_find(&tm->tm_queued_frees, &tsearch, &where); + if (ts != NULL) { + if (start < ts->ts_start) + trim_map_segment_add(tm, start, ts->ts_start, txg); + if (end > ts->ts_end) + trim_map_segment_add(tm, ts->ts_end, end, txg); + return; + } + + ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE); + ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER); + + merge_before = (ts_before != NULL && ts_before->ts_end == start); + merge_after = (ts_after != NULL && ts_after->ts_start == end); + + if (merge_before && merge_after) { + avl_remove(&tm->tm_queued_frees, ts_before); + TRIM_MAP_REM(tm, ts_before); + TRIM_MAP_REM(tm, ts_after); + ts_after->ts_start = ts_before->ts_start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + TRIM_MAP_ADD(tm, ts_after); + kmem_free(ts_before, sizeof (*ts_before)); + } else if (merge_before) { + TRIM_MAP_REM(tm, ts_before); + ts_before->ts_end = end; + ts_before->ts_txg = txg; + ts_before->ts_time = time; + TRIM_MAP_ADD(tm, ts_before); + } else if (merge_after) { + TRIM_MAP_REM(tm, ts_after); + ts_after->ts_start = start; + ts_after->ts_txg = txg; + ts_after->ts_time = time; + TRIM_MAP_ADD(tm, ts_after); + } else { + ts = kmem_alloc(sizeof (*ts), KM_SLEEP); + ts->ts_start = start; + ts->ts_end = end; + ts->ts_txg = txg; + ts->ts_time = time; + avl_insert(&tm->tm_queued_frees, ts, where); + TRIM_MAP_ADD(tm, ts); + } +} + +static void +trim_map_segment_remove(trim_map_t *tm, trim_seg_t *ts, uint64_t start, + uint64_t end) +{ + trim_seg_t *nts; + boolean_t left_over, right_over; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + left_over = (ts->ts_start < start); + right_over = (ts->ts_end > end); + + TRIM_MAP_REM(tm, ts); + if (left_over && right_over) { + nts = kmem_alloc(sizeof (*nts), KM_SLEEP); + nts->ts_start = end; + nts->ts_end = ts->ts_end; + nts->ts_txg = ts->ts_txg; + nts->ts_time = ts->ts_time; + ts->ts_end = start; + avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); + TRIM_MAP_ADD(tm, ts); + TRIM_MAP_ADD(tm, nts); + } else if (left_over) { + ts->ts_end = start; + TRIM_MAP_ADD(tm, ts); + } else if (right_over) { + ts->ts_start = end; + TRIM_MAP_ADD(tm, ts); + } else { + avl_remove(&tm->tm_queued_frees, ts); + kmem_free(ts, sizeof (*ts)); + } +} + +static void +trim_map_free_locked(trim_map_t *tm, uint64_t start, uint64_t end, uint64_t txg) +{ + zio_t zsearch, *zs; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + + zsearch.io_offset = start; + zsearch.io_size = end - start; + + zs = avl_find(&tm->tm_inflight_writes, &zsearch, NULL); + if (zs == NULL) { + trim_map_segment_add(tm, start, end, txg); + return; + } + if (start < zs->io_offset) 
+ trim_map_free_locked(tm, start, zs->io_offset, txg); + if (zs->io_offset + zs->io_size < end) + trim_map_free_locked(tm, zs->io_offset + zs->io_size, end, txg); +} + +void +trim_map_free(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) +{ + trim_map_t *tm = vd->vdev_trimmap; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + trim_map_free_locked(tm, offset, TRIM_ZIO_END(vd, offset, size), txg); + mutex_exit(&tm->tm_lock); +} + +boolean_t +trim_map_write_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t tsearch, *ts; + boolean_t left_over, right_over; + uint64_t start, end; + + if (!zfs_trim_enabled || vd->vdev_notrim || tm == NULL) + return (B_TRUE); + + start = zio->io_offset; + end = TRIM_ZIO_END(zio->io_vd, start, zio->io_size); + tsearch.ts_start = start; + tsearch.ts_end = end; + + mutex_enter(&tm->tm_lock); + + /* + * Checking for colliding in-flight frees. + */ + ts = avl_find(&tm->tm_inflight_frees, &tsearch, NULL); + if (ts != NULL) { + list_insert_tail(&tm->tm_pending_writes, zio); + mutex_exit(&tm->tm_lock); + return (B_FALSE); + } + + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + if (ts != NULL) { + /* + * Loop until all overlapping segments are removed. + */ + do { + trim_map_segment_remove(tm, ts, start, end); + ts = avl_find(&tm->tm_queued_frees, &tsearch, NULL); + } while (ts != NULL); + } + avl_add(&tm->tm_inflight_writes, zio); + + mutex_exit(&tm->tm_lock); + + return (B_TRUE); +} + +void +trim_map_write_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + trim_map_t *tm = vd->vdev_trimmap; + + /* + * Don't check for vdev_notrim, since the write could have + * started before vdev_notrim was set. + */ + if (!zfs_trim_enabled || tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + /* + * Don't fail if the write isn't in the tree, since the write + * could have started after vdev_notrim was set. + */ + if (zio->io_trim_node.avl_child[0] || + zio->io_trim_node.avl_child[1] || + AVL_XPARENT(&zio->io_trim_node) || + tm->tm_inflight_writes.avl_root == &zio->io_trim_node) + avl_remove(&tm->tm_inflight_writes, zio); + mutex_exit(&tm->tm_lock); +} + +/* + * Return the oldest segment (the one with the lowest txg / time) or NULL if: + * 1. The list is empty + * 2. The first element's txg is greater than txgsafe + * 3. 
The first element's txg is not greater than the txg argument and the + * the first element's time is not greater than time argument + */ +static trim_seg_t * +trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time, + boolean_t force) +{ + trim_seg_t *ts; + + ASSERT(MUTEX_HELD(&tm->tm_lock)); + VERIFY(txgsafe >= txg); + + ts = list_head(&tm->tm_head); + if (ts != NULL && ts->ts_txg <= txgsafe && + (ts->ts_txg <= txg || ts->ts_time <= time || force)) + return (ts); + return (NULL); +} + +static void +trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + uint64_t size, offset, txgtarget, txgsafe; + int64_t hard, soft; + hrtime_t timelimit; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC; + if (vd->vdev_isl2cache) { + txgsafe = UINT64_MAX; + txgtarget = UINT64_MAX; + } else { + txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)); + if (txgsafe > trim_txg_delay) + txgtarget = txgsafe - trim_txg_delay; + else + txgtarget = 0; + } + + mutex_enter(&tm->tm_lock); + hard = 0; + if (tm->tm_pending > trim_vdev_max_pending) + hard = (tm->tm_pending - trim_vdev_max_pending) / 4; + soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64); + /* Loop until we have sent all outstanding free's */ + while (soft > 0 && + (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0)) + != NULL) { + TRIM_MAP_REM(tm, ts); + avl_remove(&tm->tm_queued_frees, ts); + avl_add(&tm->tm_inflight_frees, ts); + size = ts->ts_end - ts->ts_start; + offset = ts->ts_start; + /* + * We drop the lock while we call zio_nowait as the IO + * scheduler can result in a different IO being run e.g. + * a write which would result in a recursive lock. 
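To illustrate the throttling math in trim_map_vdev_commit() above (again an aside, not part of the change): once a vdev's backlog exceeds trim_vdev_max_pending, the "hard" quota is a quarter of the excess, and the "soft" quota adds roughly one-thirtieth of the backlog per pass on top of that, rounded up to a multiple of 64. The tunable defaults (10000 and 30) come from this file; the 12000-segment backlog is hypothetical.

#include <stdio.h>
#include <stdint.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uint64_t trim_vdev_max_pending = 10000;	/* default tunable */
	uint64_t trim_timeout = 30;		/* default tunable (seconds) */
	uint64_t tm_pending = 12000;		/* hypothetical backlog */
	int64_t hard = 0, soft;

	if (tm_pending > trim_vdev_max_pending)
		hard = (tm_pending - trim_vdev_max_pending) / 4;
	soft = P2ROUNDUP(hard + tm_pending / trim_timeout + 1, 64);

	/* hard = 2000 / 4 = 500; soft = roundup(500 + 400 + 1, 64) = 960 */
	printf("hard = %lld soft = %lld\n", (long long)hard, (long long)soft);
	return (0);
}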
+ */ + mutex_exit(&tm->tm_lock); + + zio_nowait(zio_trim(zio, spa, vd, offset, size)); + + soft -= TRIM_MAP_SEGS(size); + hard -= TRIM_MAP_SEGS(size); + mutex_enter(&tm->tm_lock); + } + mutex_exit(&tm->tm_lock); +} + +static void +trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd) +{ + trim_map_t *tm = vd->vdev_trimmap; + trim_seg_t *ts; + list_t pending_writes; + zio_t *zio; + uint64_t start, size; + void *cookie; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if (tm == NULL) + return; + + mutex_enter(&tm->tm_lock); + if (!avl_is_empty(&tm->tm_inflight_frees)) { + cookie = NULL; + while ((ts = avl_destroy_nodes(&tm->tm_inflight_frees, + &cookie)) != NULL) { + kmem_free(ts, sizeof (*ts)); + } + } + list_create(&pending_writes, sizeof (zio_t), offsetof(zio_t, + io_trim_link)); + list_move_tail(&pending_writes, &tm->tm_pending_writes); + mutex_exit(&tm->tm_lock); + + while ((zio = list_remove_head(&pending_writes)) != NULL) { + zio_vdev_io_reissue(zio); + zio_execute(zio); + } + list_destroy(&pending_writes); +} + +static void +trim_map_commit(spa_t *spa, zio_t *zio, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit(spa, zio, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit(spa, zio, vd->vdev_child[c]); + } +} + +static void +trim_map_commit_done(spa_t *spa, vdev_t *vd) +{ + int c; + + if (vd == NULL) + return; + + if (vd->vdev_ops->vdev_op_leaf) { + trim_map_vdev_commit_done(spa, vd); + } else { + for (c = 0; c < vd->vdev_children; c++) + trim_map_commit_done(spa, vd->vdev_child[c]); + } +} + +static void +trim_thread(void *arg) +{ + spa_t *spa = arg; + zio_t *zio; + +#ifdef __FreeBSD__ +#ifdef _KERNEL + (void) snprintf(curthread->td_name, sizeof(curthread->td_name), + "trim %s", spa_name(spa)); +#endif +#endif +#ifdef __NetBSD__ +#ifdef _KERNEL + size_t sz; + char *name, *oname; + struct lwp *l = curlwp; + + name = kmem_alloc(MAXCOMLEN, KM_SLEEP); + snprintf(name, MAXCOMLEN, "trim %s", spa_name(spa)); + name[MAXCOMLEN - 1] = 0; + + lwp_lock(l); + oname = l->l_name; + l->l_name = name; + lwp_unlock(l); + + if (oname != NULL) + kmem_free(oname, MAXCOMLEN); +#endif +#endif + + for (;;) { + mutex_enter(&spa->spa_trim_lock); + if (spa->spa_trim_thread == NULL) { + spa->spa_trim_thread = curthread; + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); + thread_exit(); + } + + (void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock, + hz * trim_max_interval); + mutex_exit(&spa->spa_trim_lock); + + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + trim_map_commit(spa, zio, spa->spa_root_vdev); + (void) zio_wait(zio); + trim_map_commit_done(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_STATE, FTAG); + } +} + +void +trim_thread_create(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + + mutex_init(&spa->spa_trim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa->spa_trim_cv, NULL, CV_DEFAULT, NULL); + mutex_enter(&spa->spa_trim_lock); + spa->spa_trim_thread = thread_create(NULL, 0, trim_thread, spa, 0, &p0, + TS_RUN, minclsyspri); + mutex_exit(&spa->spa_trim_lock); +} + +void +trim_thread_destroy(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + /* Setting spa_trim_thread to NULL tells the thread to stop. */ + spa->spa_trim_thread = NULL; + cv_signal(&spa->spa_trim_cv); + /* The thread will set it back to != NULL on exit. 
*/ + while (spa->spa_trim_thread == NULL) + cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock); + spa->spa_trim_thread = NULL; + mutex_exit(&spa->spa_trim_lock); + + cv_destroy(&spa->spa_trim_cv); + mutex_destroy(&spa->spa_trim_lock); +} + +void +trim_thread_wakeup(spa_t *spa) +{ + + if (!zfs_trim_enabled) + return; + if (spa->spa_trim_thread == NULL) + return; + + mutex_enter(&spa->spa_trim_lock); + cv_signal(&spa->spa_trim_cv); + mutex_exit(&spa->spa_trim_lock); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c,v retrieving revision 1.4 diff -u -p -r1.4 txg.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c 20 Nov 2011 02:54:25 -0000 1.4 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c 4 Feb 2015 07:24:17 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include @@ -28,16 +29,91 @@ #include #include #include +#include #include /* - * Pool-wide transaction groups. - */ - -static void txg_sync_thread(void *); -static void txg_quiesce_thread(void *); - -int zfs_txg_timeout = 30; /* max seconds worth of delta per txg */ + * ZFS Transaction Groups + * ---------------------- + * + * ZFS transaction groups are, as the name implies, groups of transactions + * that act on persistent state. ZFS asserts consistency at the granularity of + * these transaction groups. Each successive transaction group (txg) is + * assigned a 64-bit consecutive identifier. There are three active + * transaction group states: open, quiescing, or syncing. At any given time, + * there may be an active txg associated with each state; each active txg may + * either be processing, or blocked waiting to enter the next state. There may + * be up to three active txgs, and there is always a txg in the open state + * (though it may be blocked waiting to enter the quiescing state). In broad + * strokes, transactions -- operations that change in-memory structures -- are + * accepted into the txg in the open state, and are completed while the txg is + * in the open or quiescing states. The accumulated changes are written to + * disk in the syncing state. + * + * Open + * + * When a new txg becomes active, it first enters the open state. New + * transactions -- updates to in-memory structures -- are assigned to the + * currently open txg. There is always a txg in the open state so that ZFS can + * accept new changes (though the txg may refuse new changes if it has hit + * some limit). ZFS advances the open txg to the next state for a variety of + * reasons such as it hitting a time or size threshold, or the execution of an + * administrative action that must be completed in the syncing state. + * + * Quiescing + * + * After a txg exits the open state, it enters the quiescing state. The + * quiescing state is intended to provide a buffer between accepting new + * transactions in the open state and writing them out to stable storage in + * the syncing state. While quiescing, transactions can continue their + * operation without delaying either of the other states. 
Typically, a txg is + * in the quiescing state very briefly since the operations are bounded by + * software latencies rather than, say, slower I/O latencies. After all + * transactions complete, the txg is ready to enter the next state. + * + * Syncing + * + * In the syncing state, the in-memory state built up during the open and (to + * a lesser degree) the quiescing states is written to stable storage. The + * process of writing out modified data can, in turn modify more data. For + * example when we write new blocks, we need to allocate space for them; those + * allocations modify metadata (space maps)... which themselves must be + * written to stable storage. During the sync state, ZFS iterates, writing out + * data until it converges and all in-memory changes have been written out. + * The first such pass is the largest as it encompasses all the modified user + * data (as opposed to filesystem metadata). Subsequent passes typically have + * far less data to write as they consist exclusively of filesystem metadata. + * + * To ensure convergence, after a certain number of passes ZFS begins + * overwriting locations on stable storage that had been allocated earlier in + * the syncing state (and subsequently freed). ZFS usually allocates new + * blocks to optimize for large, continuous, writes. For the syncing state to + * converge however it must complete a pass where no new blocks are allocated + * since each allocation requires a modification of persistent metadata. + * Further, to hasten convergence, after a prescribed number of passes, ZFS + * also defers frees, and stops compressing. + * + * In addition to writing out user data, we must also execute synctasks during + * the syncing context. A synctask is the mechanism by which some + * administrative activities work such as creating and destroying snapshots or + * datasets. Note that when a synctask is initiated it enters the open txg, + * and ZFS then pushes that txg as quickly as possible to completion of the + * syncing state in order to reduce the latency of the administrative + * activity. To complete the syncing state, ZFS writes out a new uberblock, + * the root of the tree of blocks that comprise all state stored on the ZFS + * pool. Finally, if there is a quiesced txg waiting, we signal that it can + * now transition to the syncing state. + */ + +static void txg_sync_thread(void *arg); +static void txg_quiesce_thread(void *arg); + +int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG"); +SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0, + "Maximum seconds worth of delta per txg"); /* * Prepare the txg subsystem. @@ -55,6 +131,8 @@ txg_init(dsl_pool_t *dp, uint64_t txg) int i; mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, + NULL); for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); @@ -97,6 +175,7 @@ txg_fini(dsl_pool_t *dp) for (c = 0; c < max_ncpus; c++) { int i; + mutex_destroy(&tx->tx_cpu[c].tc_open_lock); mutex_destroy(&tx->tx_cpu[c].tc_lock); for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); @@ -136,7 +215,7 @@ txg_sync_start(dsl_pool_t *dp) * 32-bit x86. This is due in part to nested pools and * scrub_visitbp() recursion. 
*/ - tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread, + tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); @@ -161,7 +240,7 @@ txg_thread_exit(tx_state_t *tx, callb_cp } static void -txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) { CALLB_CPR_SAFE_BEGIN(cpr); @@ -220,10 +299,12 @@ txg_hold_open(dsl_pool_t *dp, txg_handle tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; uint64_t txg; - mutex_enter(&tc->tc_lock); - + mutex_enter(&tc->tc_open_lock); txg = tx->tx_open_txg; + + mutex_enter(&tc->tc_lock); tc->tc_count[txg & TXG_MASK]++; + mutex_exit(&tc->tc_lock); th->th_cpu = tc; th->th_txg = txg; @@ -236,7 +317,8 @@ txg_rele_to_quiesce(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; - mutex_exit(&tc->tc_lock); + ASSERT(!MUTEX_HELD(&tc->tc_lock)); + mutex_exit(&tc->tc_open_lock); } void @@ -265,7 +347,13 @@ txg_rele_to_sync(txg_handle_t *th) th->th_cpu = NULL; /* defensive */ } -static void +/* + * Blocks until all transactions in the group are committed. + * + * On return, the transaction group has reached a stable state in which it can + * then be passed off to the syncing context. + */ +static __noinline void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; @@ -273,20 +361,24 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg int c; /* - * Grab all tx_cpu locks so nobody else can get into this txg. + * Grab all tc_open_locks so nobody else can get into this txg. */ for (c = 0; c < max_ncpus; c++) - mutex_enter(&tx->tx_cpu[c].tc_lock); + mutex_enter(&tx->tx_cpu[c].tc_open_lock); ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; + tx->tx_open_time = gethrtime(); + + DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); + DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. */ for (c = 0; c < max_ncpus; c++) - mutex_exit(&tx->tx_cpu[c].tc_lock); + mutex_exit(&tx->tx_cpu[c].tc_open_lock); /* * Quiesce the transaction group by waiting for everyone to txg_exit(). @@ -301,8 +393,10 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg } static void -txg_do_callbacks(list_t *cb_list) +txg_do_callbacks(void *arg) { + list_t *cb_list = arg; + dmu_tx_do_callbacks(cb_list, 0); list_destroy(cb_list); @@ -312,6 +406,9 @@ txg_do_callbacks(list_t *cb_list) /* * Dispatch the commit callbacks registered on this txg to worker threads. + * + * If no callbacks are registered for a given TXG, nothing happens. + * This function creates a taskq for the associated pool, if needed. */ static void txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) @@ -322,7 +419,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, u for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; - /* No need to lock tx_cpu_t at this point */ + /* + * No need to lock tx_cpu_t at this point, since this can + * only be called once a txg has been synced. 
+ */ int g = txg & TXG_MASK; @@ -342,7 +442,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, u list_create(cb_list, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); - list_move_tail(&tc->tc_callbacks[g], cb_list); + list_move_tail(cb_list, &tc->tc_callbacks[g]); (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) txg_do_callbacks, cb_list, TQ_SLEEP); @@ -359,24 +459,24 @@ txg_sync_thread(void *arg) uint64_t start, delta; txg_thread_enter(tx, &cpr); - dprintf("txg_sync_thread called\n"); + start = delta = 0; for (;;) { - uint64_t timer, timeout = zfs_txg_timeout * hz; + uint64_t timeout = zfs_txg_timeout * hz; + uint64_t timer; uint64_t txg; - dprintf("txg_sync_thread thread for\n"); + /* - * We sync when we're scrubbing, there's someone waiting + * We sync when we're scanning, there's someone waiting * on us, or the quiesce thread has handed off a txg to * us, or we have reached our timeout. */ timer = (delta >= timeout ? 0 : timeout - delta); - while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || - spa_load_state(spa) != SPA_LOAD_NONE || - spa_shutting_down(spa)) && + while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0) { + tx->tx_quiesced_txg == 0 && + dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); @@ -406,6 +506,7 @@ txg_sync_thread(void *arg) txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; + DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_quiesce_more_cv); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", @@ -419,6 +520,7 @@ txg_sync_thread(void *arg) mutex_enter(&tx->tx_sync_lock); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; + DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_done_cv); /* @@ -468,23 +570,24 @@ txg_quiesce_thread(void *arg) */ dprintf("quiesce done, handing off txg %llu\n", txg); tx->tx_quiesced_txg = txg; + DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); } } /* - * Delay this thread by 'ticks' if we are still in the open transaction - * group and there is already a waiting txg quiesing or quiesced. Abort - * the delay if this txg stalls or enters the quiesing state. + * Delay this thread by delay nanoseconds if we are still in the open + * transaction group and there is already a waiting txg quiesing or quiesced. + * Abort the delay if this txg stalls or enters the quiesing state. 
*/ void -txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) +txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) { tx_state_t *tx = &dp->dp_tx; - int timeout = ddi_get_lbolt() + ticks; + hrtime_t start = gethrtime(); - /* don't delay if this txg could transition to quiesing immediately */ + /* don't delay if this txg could transition to quiescing immediately */ if (tx->tx_open_txg > txg || tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) return; @@ -495,10 +598,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, return; } - while (ddi_get_lbolt() < timeout && - tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) - (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, - timeout); + while (gethrtime() - start < delay && + tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { + (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, + &tx->tx_sync_lock, delay, resolution, 0); + } mutex_exit(&tx->tx_sync_lock); } @@ -508,6 +612,8 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t { tx_state_t *tx = &dp->dp_tx; + ASSERT(!dsl_pool_config_held(dp)); + mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) @@ -531,6 +637,8 @@ txg_wait_open(dsl_pool_t *dp, uint64_t t { tx_state_t *tx = &dp->dp_tx; + ASSERT(!dsl_pool_config_held(dp)); + mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) @@ -546,6 +654,28 @@ txg_wait_open(dsl_pool_t *dp, uint64_t t mutex_exit(&tx->tx_sync_lock); } +/* + * If there isn't a txg syncing or in the pipeline, push another txg through + * the pipeline by queiscing the open txg. + */ +void +txg_kick(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(!dsl_pool_config_held(dp)); + + mutex_enter(&tx->tx_sync_lock); + if (tx->tx_syncing_txg == 0 && + tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && + tx->tx_sync_txg_waiting <= tx->tx_synced_txg && + tx->tx_quiesced_txg <= tx->tx_synced_txg) { + tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; + cv_broadcast(&tx->tx_quiesce_more_cv); + } + mutex_exit(&tx->tx_sync_lock); +} + boolean_t txg_stalled(dsl_pool_t *dp) { @@ -589,33 +719,80 @@ txg_list_destroy(txg_list_t *tl) mutex_destroy(&tl->tl_lock); } -int +boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg) { return (tl->tl_head[txg & TXG_MASK] == NULL); } /* - * Add an entry to the list. - * Returns 0 if it's a new entry, 1 if it's already there. + * Returns true if all txg lists are empty. + * + * Warning: this is inherently racy (an item could be added immediately after this + * function returns). We don't bother with the lock because it wouldn't change the + * semantics. + */ +boolean_t +txg_all_lists_empty(txg_list_t *tl) +{ + for (int i = 0; i < TXG_SIZE; i++) { + if (!txg_list_empty(tl, i)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +/* + * Add an entry to the list (unless it's already on the list). + * Returns B_TRUE if it was actually added. */ -int +boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - int already_on_list; + boolean_t add; mutex_enter(&tl->tl_lock); - already_on_list = tn->tn_member[t]; - if (!already_on_list) { + add = (tn->tn_member[t] == 0); + if (add) { tn->tn_member[t] = 1; tn->tn_next[t] = tl->tl_head[t]; tl->tl_head[t] = tn; } mutex_exit(&tl->tl_lock); - return (already_on_list); + return (add); +} + +/* + * Add an entry to the end of the list, unless it's already on the list. + * (walks list to find end) + * Returns B_TRUE if it was actually added. 
+ */ +boolean_t +txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + boolean_t add; + + mutex_enter(&tl->tl_lock); + add = (tn->tn_member[t] == 0); + if (add) { + txg_node_t **tp; + + for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) + continue; + + tn->tn_member[t] = 1; + tn->tn_next[t] = NULL; + *tp = tn; + } + mutex_exit(&tl->tl_lock); + + return (add); } /* @@ -666,13 +843,13 @@ txg_list_remove_this(txg_list_t *tl, voi return (NULL); } -int +boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); - return (tn->tn_member[t]); + return (tn->tn_member[t] != 0); } /* Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c,v retrieving revision 1.1.1.1 diff -u -p -r1.1.1.1 uberblock.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c 7 Aug 2009 18:33:14 -0000 1.1.1.1 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/uberblock.c 4 Feb 2015 07:24:13 -0000 @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -36,16 +34,16 @@ uberblock_verify(uberblock_t *ub) byteswap_uint64_array(ub, sizeof (uberblock_t)); if (ub->ub_magic != UBERBLOCK_MAGIC) - return (EINVAL); + return (SET_ERROR(EINVAL)); return (0); } /* - * Update the uberblock and return a boolean value indicating whether - * anything changed in this transaction group. + * Update the uberblock and return TRUE if anything changed in this + * transaction group. */ -int +boolean_t uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg) { ASSERT(ub->ub_txg < txg); @@ -58,6 +56,7 @@ uberblock_update(uberblock_t *ub, vdev_t ub->ub_txg = txg; ub->ub_guid_sum = rvd->vdev_guid_sum; ub->ub_timestamp = gethrestime_sec(); + ub->ub_software_version = SPA_VERSION; return (ub->ub_rootbp.blk_birth == txg); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c,v retrieving revision 1.4 diff -u -p -r1.4 vdev.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c 20 Nov 2011 02:54:25 -0000 1.4 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c 4 May 2017 13:47:07 -0000 @@ -20,8 +20,12 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome */ #include @@ -35,31 +39,134 @@ #include #include #include +#include #include #include #include #include #include +#include +#include + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); /* * Virtual device management. 
*/ +/* + * The limit for ZFS to automatically increase a top-level vdev's ashift + * from logical ashift to physical ashift. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 12 (4096 bytes) + * zfs_max_auto_ashift = 11 (2048 bytes) + * zfs_min_auto_ashift = 9 (512 bytes) + * + * On pool creation or the addition of a new top-level vdev, ZFS will + * increase the ashift of the top-level vdev to 2048 as limited by + * zfs_max_auto_ashift. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 12 (4096 bytes) + * zfs_max_auto_ashift = 13 (8192 bytes) + * zfs_min_auto_ashift = 9 (512 bytes) + * + * On pool creation or the addition of a new top-level vdev, ZFS will + * increase the ashift of the top-level vdev to 4096 to match the + * max vdev_physical_ashift. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 9 (512 bytes) + * zfs_max_auto_ashift = 13 (8192 bytes) + * zfs_min_auto_ashift = 12 (4096 bytes) + * + * On pool creation or the addition of a new top-level vdev, ZFS will + * increase the ashift of the top-level vdev to 4096 to match the + * zfs_min_auto_ashift. + */ +static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; +static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; + +#ifdef __FreeBSD__ +static int +sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_max_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) + return (EINVAL); + + zfs_max_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_vfs_zfs_max_auto_ashift, "QU", + "Max ashift used when optimising for logical -> physical sectors size on " + "new top-level vdevs."); + +static int +sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_min_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) + return (EINVAL); + + zfs_min_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_vfs_zfs_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdevs."); +#endif + static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, +#if defined(__FreeBSD__) && defined(_KERNEL) + &vdev_geom_ops, +#else &vdev_disk_ops, +#endif &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, NULL }; -/* maximum scrub/resilver I/O queue per leaf vdev */ -int zfs_scrub_limit = 10; + +/* + * When a vdev is added, it will be divided into approximately (but no + * more than) this number of metaslabs. + */ +int metaslabs_per_vdev = 200; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, + &metaslabs_per_vdev, 0, + "When a vdev is added, how many metaslabs the vdev should be divided into"); /* * Given a vdev type, return the appropriate ops vector. 
@@ -106,7 +213,7 @@ vdev_get_min_asize(vdev_t *vd) vdev_t *pvd = vd->vdev_parent; /* - * The our parent is NULL (inactive spare or cache) or is the root, + * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) @@ -169,14 +276,35 @@ vdev_lookup_by_guid(vdev_t *vd, uint64_t return (NULL); } +static int +vdev_count_leaves_impl(vdev_t *vd) +{ + int n = 0; + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (int c = 0; c < vd->vdev_children; c++) + n += vdev_count_leaves_impl(vd->vdev_child[c]); + + return (n); +} + +int +vdev_count_leaves(spa_t *spa) +{ + return (vdev_count_leaves_impl(spa->spa_root_vdev)); +} + void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; + spa_t *spa = cvd->vdev_spa; - ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(cvd->vdev_parent == NULL); cvd->vdev_parent = pvd; @@ -207,9 +335,6 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; } void @@ -244,9 +369,6 @@ vdev_remove_child(vdev_t *pvd, vdev_t *c */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; - - if (cvd->vdev_ops->vdev_op_leaf) - cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; } /* @@ -292,6 +414,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; + spa->spa_load_guid = spa_generate_guid(NULL); } if (guid == 0 && ops != &vdev_hole_ops) { @@ -321,8 +444,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, + vd->vdev_dtl[t] = range_tree_create(NULL, NULL, &vd->vdev_dtl_lock); } txg_list_create(&vd->vdev_ms_list, @@ -353,10 +477,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) - return (EINVAL); + return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. @@ -367,26 +491,26 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) - return (EINVAL); + return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. 
*/ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) - return (EINVAL); + return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. @@ -394,10 +518,10 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); /* * Set the nparity property for RAID-Z vdevs. @@ -407,24 +531,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (EINVAL); + return (SET_ERROR(EINVAL)); /* * Previous versions could only support 1 or 2 parity * device. */ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (EINVAL); + return (SET_ERROR(EINVAL)); /* * Otherwise, we default to 1 parity device for RAID-Z. */ @@ -487,9 +611,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, + &vd->vdev_top_zap); + } else { + ASSERT0(vd->vdev_top_zap); } - if (parent && !parent->vdev_parent) { + if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || @@ -498,15 +628,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl spa_log_class(spa) : spa_normal_class(spa), vd); } + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { + (void) nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); + } else { + ASSERT0(vd->vdev_leaf_zap); + } + /* * If we're a leaf vdev, try to load the DTL object and other state. */ + if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl_smo.smo_object); + &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } @@ -522,6 +661,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvl (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, + &vd->vdev_resilver_txg); + /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be @@ -590,9 +732,9 @@ vdev_free(vdev_t *vd) metaslab_group_destroy(vd->vdev_mg); } - ASSERT3U(vd->vdev_stat.vs_space, ==, 0); - ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); - ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); + ASSERT0(vd->vdev_stat.vs_space); + ASSERT0(vd->vdev_stat.vs_dspace); + ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. 
@@ -625,12 +767,14 @@ vdev_free(vdev_t *vd) txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); + space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { - space_map_unload(&vd->vdev_dtl[t]); - space_map_destroy(&vd->vdev_dtl[t]); + range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); + range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_queue_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -657,11 +801,15 @@ vdev_top_transfer(vdev_t *svd, vdev_t *t tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; + tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; + svd->vdev_top_zap = 0; + if (tvd->vdev_mg) + ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_ms = svd->vdev_ms; @@ -733,7 +881,10 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; + mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; + mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; @@ -765,6 +916,8 @@ vdev_remove_parent(vdev_t *cvd) mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; + cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; + cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); @@ -815,9 +968,9 @@ vdev_metaslab_init(vdev_t *vd, uint64_t /* * Compute the raidz-deflation ratio. Note, we hard-code - * in 128k (1 << 17) because it is the current "typical" blocksize. - * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, - * or we will inconsistently account for existing bp's. + * in 128k (1 << 17) because it is the "typical" blocksize. + * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, + * otherwise it would inconsistently account for existing bp's. */ vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); @@ -835,33 +988,31 @@ vdev_metaslab_init(vdev_t *vd, uint64_t vd->vdev_ms_count = newc; for (m = oldc; m < newc; m++) { - space_map_obj_t smo = { 0, 0, 0 }; + uint64_t object = 0; + if (txg == 0) { - uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error) return (error); - if (object != 0) { - dmu_buf_t *db; - error = dmu_bonus_hold(mos, object, FTAG, &db); - if (error) - return (error); - ASSERT3U(db->db_size, >=, sizeof (smo)); - bcopy(db->db_data, &smo, sizeof (smo)); - ASSERT3U(smo.smo_object, ==, object); - dmu_buf_rele(db, FTAG); - } } - vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, - m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); + + error = metaslab_init(vd->vdev_mg, m, object, txg, + &(vd->vdev_ms[m])); + if (error) + return (error); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); - if (oldc == 0) + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. 
+ */ + if (oldc == 0 && !vd->vdev_removing) metaslab_group_activate(vd->vdev_mg); if (txg == 0) @@ -878,9 +1029,12 @@ vdev_metaslab_fini(vdev_t *vd) if (vd->vdev_ms != NULL) { metaslab_group_passivate(vd->vdev_mg); - for (m = 0; m < count; m++) - if (vd->vdev_ms[m] != NULL) - metaslab_fini(vd->vdev_ms[m]); + for (m = 0; m < count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp != NULL) + metaslab_fini(msp); + } kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; } @@ -929,7 +1083,7 @@ vdev_probe_done(zio_t *zio) ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, 0, 0); - zio->io_error = ENXIO; + zio->io_error = SET_ERROR(ENXIO); } mutex_enter(&vd->vdev_probe_lock); @@ -937,18 +1091,21 @@ vdev_probe_done(zio_t *zio) vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); - while ((pio = zio_walk_parents(zio)) != NULL) + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) - pio->io_error = ENXIO; + pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* - * Determine whether this device is accessible by reading and writing - * to several known locations: the pad regions of each vdev label - * but the first (which we leave alone in case it contains a VTOC). + * Determine whether this device is accessible. + * + * Read and write to several known locations: the pad regions of each + * vdev label but the first, which we leave alone in case it contains + * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) @@ -1005,6 +1162,10 @@ vdev_probe(vdev_t *vd, zio_t *zio) vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + /* + * We can't change the vdev state in this context, so we + * kick off an async task to do it on our behalf. 
+ */ if (zio != NULL) { vd->vdev_probe_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_PROBE); @@ -1070,7 +1231,7 @@ vdev_open_children(vdev_t *vd) * in a single thread so that the same thread holds the * spa_namespace_lock */ - if (vdev_uses_zvols(vd)) { + if (B_TRUE || vdev_uses_zvols(vd)) { for (int c = 0; c < children; c++) vd->vdev_child[c]->vdev_open_error = vdev_open(vd->vdev_child[c]); @@ -1095,8 +1256,10 @@ vdev_open(vdev_t *vd) spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; - uint64_t asize, psize; - uint64_t ashift = 0; + uint64_t max_osize = 0; + uint64_t asize, max_asize, psize; + uint64_t logical_ashift = 0; + uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -1107,6 +1270,7 @@ vdev_open(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; + vd->vdev_notrim = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* @@ -1119,14 +1283,15 @@ vdev_open(vdev_t *vd) vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); - return (ENXIO); + return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); - return (ENXIO); + return (SET_ERROR(ENXIO)); } - error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, + &logical_ashift, &physical_ashift); /* * Reset the vdev_reopening flag so that we actually close @@ -1158,7 +1323,7 @@ vdev_open(vdev_t *vd) vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); - return (ENXIO); + return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { @@ -1175,6 +1340,9 @@ vdev_open(vdev_t *vd) if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); + if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) + trim_map_create(vd); + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, @@ -1184,24 +1352,28 @@ vdev_open(vdev_t *vd) } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); + max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); - return (EOVERFLOW); + return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + max_asize = max_osize - (VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); - return (EOVERFLOW); + return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; + max_asize = max_osize; } vd->vdev_psize = psize; @@ -1212,6 +1384,17 @@ vdev_open(vdev_t *vd) if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); + return (SET_ERROR(EINVAL)); + } + + vd->vdev_physical_ashift = + MAX(physical_ashift, vd->vdev_physical_ashift); + vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); + vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); + + if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + 
VDEV_AUX_ASHIFT_TOO_BIG); return (EINVAL); } @@ -1221,16 +1404,18 @@ vdev_open(vdev_t *vd) * For testing purposes, a higher ashift can be requested. */ vd->vdev_asize = asize; - vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); + vd->vdev_max_asize = max_asize; } else { /* * Make sure the alignment requirement hasn't increased. */ - if (ashift > vd->vdev_top->vdev_ashift) { + if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && + vd->vdev_ops->vdev_op_leaf) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (EINVAL); } + vd->vdev_max_asize = max_asize; } /* @@ -1250,12 +1435,23 @@ vdev_open(vdev_t *vd) */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_IO_FAILURE); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); return (error); } /* + * Track the min and max ashift values for normal data devices. + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + !vd->vdev_islog && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + + /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a scrub, * since this would just restart the scrub we are already doing. @@ -1272,13 +1468,18 @@ vdev_open(vdev_t *vd) * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * + * If 'strict' is false ignore the spa guid check. This is necessary because + * if the machine crashed during a re-guid the new guid might have been written + * to all of the vdev labels, but not the cached config. The strict check + * will be performed when the pool is opened again using the mos config. + * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int -vdev_validate(vdev_t *vd) +vdev_validate(vdev_t *vd, boolean_t strict) { spa_t *spa = vd->vdev_spa; nvlist_t *label; @@ -1286,8 +1487,8 @@ vdev_validate(vdev_t *vd) uint64_t state; for (int c = 0; c < vd->vdev_children; c++) - if (vdev_validate(vd->vdev_child[c]) != 0) - return (EBADF); + if (vdev_validate(vd->vdev_child[c], strict) != 0) + return (SET_ERROR(EBADF)); /* * If the device has already failed, or was marked offline, don't do @@ -1297,8 +1498,10 @@ vdev_validate(vdev_t *vd) if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { uint64_t aux_guid = 0; nvlist_t *nvl; + uint64_t txg = spa_last_synced_txg(spa) != 0 ? 
+ spa_last_synced_txg(spa) : -1ULL; - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (0); @@ -1316,8 +1519,9 @@ vdev_validate(vdev_t *vd) return (0); } - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, - &guid) != 0 || guid != spa_guid(spa)) { + if (strict && (nvlist_lookup_uint64(label, + ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != spa_guid(spa))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); @@ -1364,13 +1568,13 @@ vdev_validate(vdev_t *vd) nvlist_free(label); /* - * If spa->spa_load_verbatim is true, no need to check the + * If this is a verbatim import, no need to check the * state of the pool. */ - if (!spa->spa_load_verbatim && + if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) - return (EBADF); + return (SET_ERROR(EBADF)); /* * If we were able to open and validate a vdev that was @@ -1406,6 +1610,9 @@ vdev_close(vdev_t *vd) vdev_cache_purge(vd); + if (vd->vdev_ops->vdev_op_leaf) + trim_map_destroy(vd); + /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that @@ -1420,6 +1627,35 @@ vdev_close(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } +void +vdev_hold(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + return; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_hold(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_hold(vd); +} + +void +vdev_rele(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_rele(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_rele(vd); +} + /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock @@ -1450,7 +1686,7 @@ vdev_reopen(vdev_t *vd) !l2arc_vdev_present(vd)) l2arc_add_vdev(spa, vd); } else { - (void) vdev_validate(vd); + (void) vdev_validate(vd, B_TRUE); } /* @@ -1477,9 +1713,10 @@ vdev_create(vdev_t *vd, uint64_t txg, bo } /* - * Recursively initialize all labels. + * Recursively load DTLs and initialize all labels. */ - if ((error = vdev_label_init(vd, txg, isreplacing ? + if ((error = vdev_dtl_load(vd)) != 0 || + (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); @@ -1492,18 +1729,47 @@ void vdev_metaslab_set_size(vdev_t *vd) { /* - * Aim for roughly 200 metaslabs per vdev. + * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. */ - vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); + vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); } +/* + * Maximize performance by inflating the configured ashift for top level + * vdevs to be as close to the physical ashift as possible while maintaining + * administrator defined limits and ensuring it doesn't go below the + * logical ashift. 
+ */ +void +vdev_ashift_optimize(vdev_t *vd) +{ + if (vd == vd->vdev_top) { + if (vd->vdev_ashift < vd->vdev_physical_ashift) { + vd->vdev_ashift = MIN( + MAX(zfs_max_auto_ashift, vd->vdev_ashift), + MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); + } else { + /* + * Unusual case where logical ashift > physical ashift + * so we can't cap the calculated ashift based on max + * ashift as that would cause failures. + * We still check if we need to increase it to match + * the min ashift. + */ + vd->vdev_ashift = MAX(zfs_min_auto_ashift, + vd->vdev_ashift); + } + } +} + void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); + ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); @@ -1514,11 +1780,21 @@ vdev_dirty(vdev_t *vd, int flags, void * (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } +void +vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_dirty_leaves(vd->vdev_child[c], flags, txg); + + if (vd->vdev_ops->vdev_op_leaf) + vdev_dirty(vd->vdev_top, flags, vd, txg); +} + /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which - * the vdev has less than perfect replication. There are three kinds of DTL: + * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * @@ -1555,30 +1831,31 @@ vdev_dirty(vdev_t *vd, int flags, void * void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - space_map_t *sm = &vd->vdev_dtl[t]; + range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); + ASSERT(spa_writeable(vd->vdev_spa)); - mutex_enter(sm->sm_lock); - if (!space_map_contains(sm, txg, size)) - space_map_add(sm, txg, size); - mutex_exit(sm->sm_lock); + mutex_enter(rt->rt_lock); + if (!range_tree_contains(rt, txg, size)) + range_tree_add(rt, txg, size); + mutex_exit(rt->rt_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - space_map_t *sm = &vd->vdev_dtl[t]; + range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); - mutex_enter(sm->sm_lock); - if (sm->sm_space != 0) - dirty = space_map_contains(sm, txg, size); - mutex_exit(sm->sm_lock); + mutex_enter(rt->rt_lock); + if (range_tree_space(rt) != 0) + dirty = range_tree_contains(rt, txg, size); + mutex_exit(rt->rt_lock); return (dirty); } @@ -1586,17 +1863,86 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_t boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { - space_map_t *sm = &vd->vdev_dtl[t]; + range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; - mutex_enter(sm->sm_lock); - empty = (sm->sm_space == 0); - mutex_exit(sm->sm_lock); + mutex_enter(rt->rt_lock); + empty = (range_tree_space(rt) == 0); + mutex_exit(rt->rt_lock); return (empty); } /* + * Returns the lowest txg in the DTL range. + */ +static uint64_t +vdev_dtl_min(vdev_t *vd) +{ + range_seg_t *rs; + + ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); + ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); + ASSERT0(vd->vdev_children); + + rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); + return (rs->rs_start - 1); +} + +/* + * Returns the highest txg in the DTL. 
+ */ +static uint64_t +vdev_dtl_max(vdev_t *vd) +{ + range_seg_t *rs; + + ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); + ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); + ASSERT0(vd->vdev_children); + + rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); + return (rs->rs_end); +} + +/* + * Determine if a resilvering vdev should remove any DTL entries from + * its range. If the vdev was resilvering for the entire duration of the + * scan then it should excise that range from its DTLs. Otherwise, this + * vdev is considered partially resilvered and should leave its DTL + * entries intact. The comment in vdev_dtl_reassess() describes how we + * excise the DTLs. + */ +static boolean_t +vdev_dtl_should_excise(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + + ASSERT0(scn->scn_phys.scn_errors); + ASSERT0(vd->vdev_children); + + if (vd->vdev_resilver_txg == 0 || + range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) + return (B_TRUE); + + /* + * When a resilver is initiated the scan will assign the scn_max_txg + * value to the highest txg value that exists in all DTLs. If this + * device's max DTL is not part of this scan (i.e. it is not in + * the range (scn_min_txg, scn_max_txg] then it is not eligible + * for excision. + */ + if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { + ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); + ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); + ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); + return (B_TRUE); + } + return (B_FALSE); +} + +/* * Reassess DTLs after a config change or scrub completion. */ void @@ -1616,10 +1962,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t t return; if (vd->vdev_ops->vdev_op_leaf) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + mutex_enter(&vd->vdev_dtl_lock); + + /* + * If we've completed a scan cleanly then determine + * if this vdev should remove any DTLs. We only want to + * excise regions on vdevs that were available during + * the entire duration of this scan. + */ if (scrub_txg != 0 && - (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { - /* XXX should check scrub_done? */ + (spa->spa_scrub_started || + (scn != NULL && scn->scn_phys.scn_errors == 0)) && + vdev_dtl_should_excise(vd)) { /* * We completed a scrub up to scrub_txg. If we * did it without rebooting, then the scrub dtl @@ -1637,27 +1993,40 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t t * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. 
*/ - space_map_ref_create(&reftree); - space_map_ref_add_map(&reftree, - &vd->vdev_dtl[DTL_MISSING], 1); - space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); - space_map_ref_add_map(&reftree, - &vd->vdev_dtl[DTL_SCRUB], 2); - space_map_ref_generate_map(&reftree, - &vd->vdev_dtl[DTL_MISSING], 1); - space_map_ref_destroy(&reftree); - } - space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); - space_map_walk(&vd->vdev_dtl[DTL_MISSING], - space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); + space_reftree_create(&reftree); + space_reftree_add_map(&reftree, + vd->vdev_dtl[DTL_MISSING], 1); + space_reftree_add_seg(&reftree, 0, scrub_txg, -1); + space_reftree_add_map(&reftree, + vd->vdev_dtl[DTL_SCRUB], 2); + space_reftree_generate_map(&reftree, + vd->vdev_dtl[DTL_MISSING], 1); + space_reftree_destroy(&reftree); + } + range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); + range_tree_walk(vd->vdev_dtl[DTL_MISSING], + range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) - space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); - space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); + range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); + range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); if (!vdev_readable(vd)) - space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); + range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); else - space_map_walk(&vd->vdev_dtl[DTL_MISSING], - space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); + range_tree_walk(vd->vdev_dtl[DTL_MISSING], + range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); + + /* + * If the vdev was resilvering and no longer has any + * DTLs then reset its resilvering flag and dirty + * the top level so that we persist the change. + */ + if (vd->vdev_resilver_txg != 0 && + range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && + range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { + vd->vdev_resilver_txg = 0; + vdev_config_dirty(vd->vdev_top); + } + mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) @@ -1677,112 +2046,187 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t t minref = vd->vdev_nparity + 1; /* RAID-Z */ else minref = vd->vdev_children; /* any kind of mirror */ - space_map_ref_create(&reftree); + space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); - space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); + space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } - space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); - space_map_ref_destroy(&reftree); + space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } -static int +int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl_smo; objset_t *mos = spa->spa_meta_objset; - dmu_buf_t *db; - int error; + int error = 0; - ASSERT(vd->vdev_children == 0); + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { + ASSERT(!vd->vdev_ishole); - if (smo->smo_object == 0) - return (0); + error = space_map_open(&vd->vdev_dtl_sm, mos, + vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); + if (error) + return (error); + ASSERT(vd->vdev_dtl_sm != NULL); - ASSERT(!vd->vdev_ishole); + mutex_enter(&vd->vdev_dtl_lock); - if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) - return (error); + /* + * Now that we've opened the space_map we need to update + * the in-core DTL. 
+ */ + space_map_update(vd->vdev_dtl_sm); - ASSERT3U(db->db_size, >=, sizeof (*smo)); - bcopy(db->db_data, smo, sizeof (*smo)); - dmu_buf_rele(db, FTAG); + error = space_map_load(vd->vdev_dtl_sm, + vd->vdev_dtl[DTL_MISSING], SM_ALLOC); + mutex_exit(&vd->vdev_dtl_lock); - mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(&vd->vdev_dtl[DTL_MISSING], - NULL, SM_ALLOC, smo, mos); - mutex_exit(&vd->vdev_dtl_lock); + return (error); + } + + for (int c = 0; c < vd->vdev_children; c++) { + error = vdev_dtl_load(vd->vdev_child[c]); + if (error != 0) + break; + } return (error); } void +vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + + VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); + VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, + zapobj, tx)); +} + +uint64_t +vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, + DMU_OT_NONE, 0, tx); + + ASSERT(zap != 0); + VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, + zap, tx)); + + return (zap); +} + +void +vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) +{ + if (vd->vdev_ops != &vdev_hole_ops && + vd->vdev_ops != &vdev_missing_ops && + vd->vdev_ops != &vdev_root_ops && + !vd->vdev_top->vdev_removing) { + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { + vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); + } + if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { + vd->vdev_top_zap = vdev_create_link_zap(vd, tx); + } + } + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_construct_zaps(vd->vdev_child[i], tx); + } +} + +void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl_smo; - space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; + range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; - space_map_t smsync; - kmutex_t smlock; - dmu_buf_t *db; + range_tree_t *rtsync; + kmutex_t rtlock; dmu_tx_t *tx; + uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(!vd->vdev_ishole); + ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - if (vd->vdev_detached) { - if (smo->smo_object != 0) { - int err = dmu_object_free(mos, smo->smo_object, tx); - ASSERT3U(err, ==, 0); - smo->smo_object = 0; + if (vd->vdev_detached || vd->vdev_top->vdev_removing) { + mutex_enter(&vd->vdev_dtl_lock); + space_map_free(vd->vdev_dtl_sm, tx); + space_map_close(vd->vdev_dtl_sm); + vd->vdev_dtl_sm = NULL; + mutex_exit(&vd->vdev_dtl_lock); + + /* + * We only destroy the leaf ZAP for detached leaves or for + * removed log devices. Removed data devices handle leaf ZAP + * cleanup later, once cancellation is no longer possible. 
+ */ + if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || + vd->vdev_top->vdev_islog)) { + vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); + vd->vdev_leaf_zap = 0; } + dmu_tx_commit(tx); return; } - if (smo->smo_object == 0) { - ASSERT(smo->smo_objsize == 0); - ASSERT(smo->smo_alloc == 0); - smo->smo_object = dmu_object_alloc(mos, - DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, - DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); - ASSERT(smo->smo_object != 0); - vdev_config_dirty(vd->vdev_top); + if (vd->vdev_dtl_sm == NULL) { + uint64_t new_object; + + new_object = space_map_alloc(mos, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, + 0, -1ULL, 0, &vd->vdev_dtl_lock)); + ASSERT(vd->vdev_dtl_sm != NULL); } - mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); + bzero(&rtlock, sizeof(rtlock)); + mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); - space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, - &smlock); + rtsync = range_tree_create(NULL, NULL, &rtlock); - mutex_enter(&smlock); + mutex_enter(&rtlock); mutex_enter(&vd->vdev_dtl_lock); - space_map_walk(sm, space_map_add, &smsync); + range_tree_walk(rt, range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); - space_map_truncate(smo, mos, tx); - space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); + space_map_truncate(vd->vdev_dtl_sm, tx); + space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); + range_tree_vacate(rtsync, NULL, NULL); - space_map_destroy(&smsync); + range_tree_destroy(rtsync); - mutex_exit(&smlock); - mutex_destroy(&smlock); + mutex_exit(&rtlock); + mutex_destroy(&rtlock); - VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); - dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, >=, sizeof (*smo)); - bcopy(smo, db->db_data, sizeof (*smo)); - dmu_buf_rele(db, FTAG); + /* + * If the object for the space map has changed then dirty + * the top level so that we update the config. 
+ */ + if (object != space_map_object(vd->vdev_dtl_sm)) { + zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " + "new object %llu", txg, spa_name(spa), object, + space_map_object(vd->vdev_dtl_sm)); + vdev_config_dirty(vd->vdev_top); + } dmu_tx_commit(tx); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_update(vd->vdev_dtl_sm); + mutex_exit(&vd->vdev_dtl_lock); } /* @@ -1813,6 +2257,9 @@ vdev_dtl_required(vdev_t *vd) vd->vdev_cant_read = cant_read; vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + if (!required && zio_injection_enabled) + required = !!zio_handle_device_injection(vd, NULL, ECHILD); + return (required); } @@ -1828,14 +2275,11 @@ vdev_resilver_needed(vdev_t *vd, uint64_ if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && + if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && vdev_writeable(vd)) { - space_seg_t *ss; - ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); - thismin = ss->ss_start - 1; - ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); - thismax = ss->ss_end; + thismin = vdev_dtl_min(vd); + thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); @@ -1902,14 +2346,14 @@ vdev_validate_aux(vdev_t *vd) if (!vdev_readable(vd)) return (0); - if ((label = vdev_label_read_config(vd)) == NULL) { + if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || - version > SPA_VERSION || + !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { @@ -1935,30 +2379,53 @@ vdev_remove(vdev_t *vd, uint64_t txg) dmu_tx_t *tx; tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); - - if (vd->vdev_dtl_smo.smo_object) { - ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); - (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); - vd->vdev_dtl_smo.smo_object = 0; - } + ASSERT(vd == vd->vdev_top); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); if (vd->vdev_ms != NULL) { + metaslab_group_t *mg = vd->vdev_mg; + + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - if (msp == NULL || msp->ms_smo.smo_object == 0) + if (msp == NULL || msp->ms_sm == NULL) continue; - ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); - (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); - msp->ms_smo.smo_object = 0; + mutex_enter(&msp->ms_lock); + /* + * If the metaslab was not loaded when the vdev + * was removed then the histogram accounting may + * not be accurate. Update the histogram information + * here so that we ensure that the metaslab group + * and metaslab class are up-to-date. 
+ */ + metaslab_group_histogram_remove(mg, msp); + + VERIFY0(space_map_allocated(msp->ms_sm)); + space_map_free(msp->ms_sm, tx); + space_map_close(msp->ms_sm); + msp->ms_sm = NULL; + mutex_exit(&msp->ms_lock); } + + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + ASSERT0(mg->mg_histogram[i]); + } if (vd->vdev_ms_array) { (void) dmu_object_free(mos, vd->vdev_ms_array, tx); vd->vdev_ms_array = 0; - vd->vdev_ms_shift = 0; + } + + if (vd->vdev_islog && vd->vdev_top_zap != 0) { + vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); + vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } @@ -1998,7 +2465,10 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } - if (vd->vdev_removing) + /* + * Remove the metadata associated with this vdev once it's empty. + */ + if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove(vd, txg); while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { @@ -2025,7 +2495,7 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { - vdev_t *vd; + vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); @@ -2035,6 +2505,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vd if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we @@ -2045,17 +2517,16 @@ vdev_fault(spa_t *spa, uint64_t guid, vd /* * Faulted state takes precedence over degraded. */ + vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* - * If marking the vdev as faulted cause the top-level vdev to become - * unavailable, then back off and simply mark the vdev as degraded - * instead. + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. */ - if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog && - vd->vdev_aux == NULL) { + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -2063,7 +2534,7 @@ vdev_fault(spa_t *spa, uint64_t guid, vd * If we reopen the device and it's not dead, only then do we * mark it degraded. */ - vdev_reopen(vd); + vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); @@ -2105,15 +2576,18 @@ vdev_degrade(spa_t *spa, uint64_t guid, } /* - * Online the given vdev. If 'unspare' is set, it implies two things. First, - * any attached spare device should be detached when the device finishes - * resilvering. Second, the online should be treated like a 'test' online case, - * so no FMA events are generated if the device fails to open. + * Online the given vdev. + * + * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached + * spare device should be detached when the device finishes resilvering. + * Second, the online should be treated like a 'test' online case, so no FMA + * events are generated if the device fails to open. 
*/ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; + boolean_t postevent = B_FALSE; spa_vdev_state_enter(spa, SCL_NONE); @@ -2123,6 +2597,10 @@ vdev_online(spa_t *spa, uint64_t guid, u if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + postevent = + (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ? + B_TRUE : B_FALSE; + tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; @@ -2158,6 +2636,10 @@ vdev_online(spa_t *spa, uint64_t guid, u return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } + + if (postevent) + spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); + return (spa_vdev_state_exit(spa, vd, 0)); } @@ -2223,7 +2705,7 @@ top: (void) spa_vdev_state_exit(spa, vd, 0); goto top; } - ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); + ASSERT0(tvd->vdev_stat.vs_alloc); } /* @@ -2289,6 +2771,14 @@ vdev_clear(spa_t *spa, vdev_t *vd) for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); + if (vd == rvd) { + for (int c = 0; c < spa->spa_l2cache.sav_count; c++) + vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); + + for (int c = 0; c < spa->spa_spares.sav_count; c++) + vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); + } + /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We @@ -2305,15 +2795,15 @@ vdev_clear(spa_t *spa, vdev_t *vd) */ vd->vdev_forcefault = B_TRUE; - vd->vdev_faulted = vd->vdev_degraded = 0; + vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; - vdev_reopen(vd); + vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; - if (vd != rvd) + if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) @@ -2373,7 +2863,8 @@ vdev_allocatable(vdev_t *vd) * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing); + !vd->vdev_cant_write && !vd->vdev_ishole && + vd->vdev_mg->mg_initialized); } boolean_t @@ -2399,17 +2890,35 @@ vdev_accessible(vdev_t *vd, zio_t *zio) void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd = vd->vdev_top; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); - vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - mutex_exit(&vd->vdev_stat_lock); + /* + * Report expandable space on top-level, non-auxillary devices only. + * The expandable space is reported in terms of metaslab sized units + * since that determines how much space the pool can expand. + */ + if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { + vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize, + 1ULL << tvd->vdev_ms_shift); + } + vs->vs_configured_ashift = vd->vdev_top != NULL + ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; + vs->vs_logical_ashift = vd->vdev_logical_ashift; + vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { + vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; + } /* * If we're getting stats on the root vdev, aggregate the I/O counts @@ -2420,15 +2929,14 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t * vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; - mutex_enter(&vd->vdev_stat_lock); for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } - vs->vs_scrub_examined += cvs->vs_scrub_examined; - mutex_exit(&vd->vdev_stat_lock); + cvs->vs_scan_removing = cvd->vdev_removing; } } + mutex_exit(&vd->vdev_stat_lock); } void @@ -2442,6 +2950,19 @@ vdev_clear_stats(vdev_t *vd) } void +vdev_scan_stat_init(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; @@ -2485,8 +3006,17 @@ vdev_stat_update(zio_t *zio, uint64_t ps mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { - if (flags & ZIO_FLAG_SCRUB_THREAD) - vs->vs_scrub_repaired += psize; + if (flags & ZIO_FLAG_SCAN_THREAD) { + dsl_scan_phys_t *scn_phys = + &spa->spa_dsl_pool->dp_scan->scn_phys; + uint64_t *processed = &scn_phys->scn_processed; + + /* XXX cleanup? */ + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(processed, psize); + vs->vs_scan_processed += psize; + } + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } @@ -2532,7 +3062,7 @@ vdev_stat_update(zio_t *zio, uint64_t ps if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || - (flags & ZIO_FLAG_SCRUB_THREAD) || + (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's @@ -2551,7 +3081,7 @@ vdev_stat_update(zio_t *zio, uint64_t ps */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; - if (flags & ZIO_FLAG_SCRUB_THREAD) { + if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); @@ -2572,35 +3102,6 @@ vdev_stat_update(zio_t *zio, uint64_t ps } } -void -vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) -{ - vdev_stat_t *vs = &vd->vdev_stat; - - for (int c = 0; c < vd->vdev_children; c++) - vdev_scrub_stat_update(vd->vdev_child[c], type, complete); - - mutex_enter(&vd->vdev_stat_lock); - - if (type == POOL_SCRUB_NONE) { - /* - * Update completion and end time. Leave everything else alone - * so we can report what happened during the previous scrub. - */ - vs->vs_scrub_complete = complete; - vs->vs_scrub_end = gethrestime_sec(); - } else { - vs->vs_scrub_type = type; - vs->vs_scrub_complete = 0; - vs->vs_scrub_examined = 0; - vs->vs_scrub_repaired = 0; - vs->vs_scrub_start = gethrestime_sec(); - vs->vs_scrub_end = 0; - } - - mutex_exit(&vd->vdev_stat_lock); -} - /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. 
@@ -2663,6 +3164,8 @@ vdev_config_dirty(vdev_t *vd) vdev_t *rvd = spa->spa_root_vdev; int c; + ASSERT(spa_writeable(spa)); + /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. @@ -2700,7 +3203,7 @@ vdev_config_dirty(vdev_t *vd) * sketchy, but it will work. */ nvlist_free(aux[c]); - aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } @@ -2751,6 +3254,7 @@ vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* @@ -2865,28 +3369,18 @@ vdev_set_state(vdev_t *vd, boolean_t iso /* * If we are setting the vdev state to anything but an open state, then - * always close the underlying device. Otherwise, we keep accessible - * but invalid devices open forever. We don't call vdev_close() itself, - * because that implies some extra checks (offline, etc) that we don't - * want here. This is limited to leaf devices, because otherwise - * closing the device will affect other children. + * always close the underlying device unless the device has requested + * a delayed close (i.e. we're about to remove or fault the device). + * Otherwise, we keep accessible but invalid devices open forever. + * We don't call vdev_close() itself, because that implies some extra + * checks (offline, etc) that we don't want here. This is limited to + * leaf devices, because otherwise closing the device will affect other + * children. */ - if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_delayed_close && vdev_is_dead(vd) && + vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); - /* - * If we have brought this vdev back into service, we need - * to notify fmd so that it can gracefully repair any outstanding - * cases due to a missing device. We do this in all cases, even those - * that probably don't correlate to a repaired fault. This is sure to - * catch all cases, and we let the zfs-retire agent sort it out. If - * this is a transient state it's OK, as the retire agent will - * double-check the state of the vdev before repairing it. - */ - if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && - vd->vdev_prevstate != state) - zfs_post_state_change(spa, vd); - if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { @@ -2905,12 +3399,13 @@ vdev_set_state(vdev_t *vd, boolean_t iso vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* - * If we fail to open a vdev during an import, we mark it as - * "not available", which signifies that it was never there to - * begin with. Failure to open such a device is not considered - * an error. + * If we fail to open a vdev during an import or recovery, we + * mark it as "not available", which signifies that it was + * never there to begin with. Failure to open such a device + * is not considered an error. 
*/ - if (spa_load_state(spa) == SPA_LOAD_IMPORT && + if ((spa_load_state(spa) == SPA_LOAD_IMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2953,9 +3448,6 @@ vdev_set_state(vdev_t *vd, boolean_t iso case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; - case VDEV_AUX_IO_FAILURE: - class = FM_EREPORT_ZFS_IO_FAILURE; - break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } @@ -2969,69 +3461,99 @@ vdev_set_state(vdev_t *vd, boolean_t iso vd->vdev_removed = B_FALSE; } + /* + * Notify the fmd of the state change. Be verbose and post + * notifications even for stuff that's not important; the fmd agent can + * sort it out. Don't emit state change events for non-leaf vdevs since + * they can't change state on their own. The FMD can check their state + * if it wants to when it sees that a leaf vdev had a state change. + */ + if (vd->vdev_ops->vdev_op_leaf) + zfs_post_state_change(spa, vd); + if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Check the vdev configuration to ensure that it's capable of supporting - * a root pool. Currently, we do not support RAID-Z or partial configuration. - * In addition, only a single top-level vdev is allowed and none of the leaves - * can be wholedisks. + * a root pool. We do not support partial configuration. + * In addition, only a single top-level vdev is allowed. + * + * FreeBSD does not have above limitations. */ boolean_t vdev_is_bootable(vdev_t *vd) { +#ifdef illumos if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && vd->vdev_children > 1) { return (B_FALSE); - } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || - strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { + } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { return (B_FALSE); } - } else if (vd->vdev_wholedisk == 1) { - return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } +#endif /* illumos */ return (B_TRUE); } /* * Load the state from the original vdev tree (ovd) which * we've retrieved from the MOS config object. If the original - * vdev was offline then we transfer that state to the device - * in the current vdev tree (nvd). + * vdev was offline or faulted then we transfer that state to the + * device in the current vdev tree (nvd). */ void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) { spa_t *spa = nvd->vdev_spa; + ASSERT(nvd->vdev_top->vdev_islog); ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); for (int c = 0; c < nvd->vdev_children; c++) vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); - if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) { + if (nvd->vdev_ops->vdev_op_leaf) { /* - * It would be nice to call vdev_offline() - * directly but the pool isn't fully loaded and - * the txg threads have not been started yet. + * Restore the persistent vdev state */ nvd->vdev_offline = ovd->vdev_offline; - vdev_reopen(nvd->vdev_top); + nvd->vdev_faulted = ovd->vdev_faulted; + nvd->vdev_degraded = ovd->vdev_degraded; + nvd->vdev_removed = ovd->vdev_removed; } } /* + * Determine if a log device has valid content. If the vdev was + * removed or faulted in the MOS config then we know that + * the content on the log device has already been written to the pool. 
+ */ +boolean_t +vdev_log_state_valid(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && + !vd->vdev_removed) + return (B_TRUE); + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_log_state_valid(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +/* * Expand a vdev if possible. */ void @@ -3064,3 +3586,50 @@ vdev_split(vdev_t *vd) } vdev_propagate_state(cvd); } + +void +vdev_deadman(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_active_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime we panic the system. + */ + fio = avl_first(&vq->vq_active_tree); + delta = gethrtime() - fio->io_timestamp; + if (delta > spa_deadman_synctime(spa)) { + zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " + "delta %lluns, last io %lluns", + fio->io_timestamp, delta, + vq->vq_io_complete_ts); + + printf("SLOW IO: zio timestamp %lluns, " + "delta %luns, last io %lluns", + fio->io_timestamp, delta, + vq->vq_io_complete_ts); + + fm_panic("I/O to pool '%s' appears to be " + "hung on vdev guid %llu at '%s'.", + spa_name(spa), + (long long unsigned int) vd->vdev_guid, + vd->vdev_path); + } + } + mutex_exit(&vq->vq_lock); + } +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 vdev_cache.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c 27 Feb 2010 22:31:10 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c 10 Oct 2016 11:09:56 -0000 @@ -22,6 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + */ #include #include @@ -71,13 +74,29 @@ * 1<io_error || ve->ve_missed_update) @@ -238,9 +258,9 @@ vdev_cache_fill(zio_t *fio) } /* - * Read data from the cache. Returns 0 on cache hit, errno on a miss. + * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. */ -int +boolean_t vdev_cache_read(zio_t *zio) { vdev_cache_t *vc = &zio->io_vd->vdev_cache; @@ -252,16 +272,16 @@ vdev_cache_read(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_READ); if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (EINVAL); + return (B_FALSE); if (zio->io_size > zfs_vdev_cache_max) - return (EOVERFLOW); + return (B_FALSE); /* * If the I/O straddles two or more cache blocks, don't cache it. 
*/ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (EXDEV); + return (B_FALSE); ASSERT(cache_phase + zio->io_size <= VCBS); @@ -273,7 +293,7 @@ vdev_cache_read(zio_t *zio) if (ve != NULL) { if (ve->ve_missed_update) { mutex_exit(&vc->vc_lock); - return (ESTALE); + return (B_FALSE); } if ((fio = ve->ve_fill_io) != NULL) { @@ -281,7 +301,7 @@ vdev_cache_read(zio_t *zio) zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_delegations); - return (0); + return (B_TRUE); } vdev_cache_hit(vc, ve, zio); @@ -289,18 +309,18 @@ vdev_cache_read(zio_t *zio) mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_hits); - return (0); + return (B_TRUE); } ve = vdev_cache_allocate(zio); if (ve == NULL) { mutex_exit(&vc->vc_lock); - return (ENOMEM); + return (B_FALSE); } fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, + ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; @@ -311,7 +331,7 @@ vdev_cache_read(zio_t *zio) zio_nowait(fio); VDCSTAT_BUMP(vdc_stat_misses); - return (0); + return (B_TRUE); } /* Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c,v retrieving revision 1.6 diff -u -p -r1.6 vdev_disk.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c 8 Dec 2015 20:56:21 -0000 1.6 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c 12 May 2017 02:10:16 -0000 @@ -1,4 +1,3 @@ - /* * CDDL HEADER START * @@ -20,8 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Joyent, Inc. All rights reserved. */ #include @@ -37,6 +38,15 @@ #include #include +#ifdef __NetBSD__ +static int +geterror(struct buf *bp) +{ + + return (bp->b_error); +} +#endif + /* * Virtual device vector for disks. */ @@ -44,6 +54,70 @@ static void vdev_disk_io_intr(buf_t *); static void +vdev_disk_alloc(vdev_t *vd) +{ + vdev_disk_t *dvd; + + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + +#ifdef illumos + /* + * Create the LDI event callback list. + */ + list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), + offsetof(vdev_disk_ldi_cb_t, lcb_next)); +#endif +} + + +static void +vdev_disk_free(vdev_t *vd) +{ + vdev_disk_t *dvd = vd->vdev_tsd; +#ifdef illumos + vdev_disk_ldi_cb_t *lcb; +#endif + + if (dvd == NULL) + return; + +#ifdef illumos + /* + * We have already closed the LDI handle. Clean up the LDI event + * callbacks and free vd->vdev_tsd. + */ + while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { + list_remove(&dvd->vd_ldi_cbs, lcb); + (void) ldi_ev_remove_callbacks(lcb->lcb_id); + kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); + } + list_destroy(&dvd->vd_ldi_cbs); +#endif + kmem_free(dvd, sizeof (vdev_disk_t)); + vd->vdev_tsd = NULL; +} + + +/* + * It's not clear what these hold/rele functions are supposed to do. 
+ */ +static void +vdev_disk_hold(vdev_t *vd) +{ + + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + +} + +static void vdev_disk_flush(struct work *work, void *cookie) { vdev_disk_t *dvd; @@ -54,20 +128,20 @@ vdev_disk_flush(struct work *work, void bp = (struct buf *)work; vp = bp->b_vp; dvd = cookie; - - KASSERT(vp == dvd->vd_vn); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + KASSERT(vp == dvd->vd_vp); + cmd = 1; - error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, - kauth_cred_get()); - VOP_UNLOCK(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred); + VOP_UNLOCK(vp, 0); bp->b_error = error; vdev_disk_io_intr(bp); } static int -vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift, uint64_t *pashift) { spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; @@ -80,7 +154,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -90,10 +164,16 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); dvd = vd->vdev_tsd; + vp = dvd->vd_vp; + KASSERT(vp != NULL); goto skip_open; } - dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + /* + * Create vd->vdev_tsd. + */ + vdev_disk_alloc(vd); + dvd = vd->vdev_tsd; /* * When opening a disk device, we want to preserve the user's original @@ -113,6 +193,13 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi */ if (vd->vdev_devid != NULL) { /* XXXNETBSD wedges */ +#ifdef illumos + if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, + &dvd->vd_minor) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } +#endif } error = EINVAL; /* presume failure */ @@ -121,74 +208,137 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi &vp, CRCREAT, 0); if (error != 0) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return error; + return (SET_ERROR(error)); } if (vp->v_type != VBLK) { vrele(vp); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return EINVAL; + return (SET_ERROR(EINVAL)); } /* * XXXNETBSD Compare the devid to the stored value. */ + /* + * Create a workqueue to process cache-flushes concurrently. + */ + error = workqueue_create(&dvd->vd_wq, "vdevsync", + vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE); + if (error != 0) { + vrele(vp); + return (SET_ERROR(error)); + } + + dvd->vd_vp = vp; + skip_open: /* * Determine the actual size of the device. * XXXNETBSD wedges. */ - error = VOP_IOCTL(vp, DIOCGPARTINFO, &pinfo, FREAD|FWRITE, - kauth_cred_get()); + error = VOP_IOCTL(vp, DIOCGPARTINFO, &pinfo, FREAD|FWRITE, kcred); if (error != 0) { - vrele(vp); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return error; + return (SET_ERROR(error)); } *psize = pinfo.pi_size * pinfo.pi_secsize; + *max_psize = *psize; + *ashift = highbit(MAX(pinfo.pi_secsize, SPA_MINBLOCKSIZE)) - 1; + *pashift = *ashift; vd->vdev_wholedisk = (pinfo.pi_offset == 0); /* XXXNETBSD */ /* - * Create a workqueue to process cache-flushes concurrently. 
- */ - error = workqueue_create(&dvd->vd_wq, "vdevsync", - vdev_disk_flush, dvd, PRI_NONE, IPL_NONE, WQ_MPSAFE); - if (error != 0) { - vrele(vp); - return error; - } - - /* * Clear the nowritecache bit, so that on a vdev_reopen() we will * try again. */ vd->vdev_nowritecache = B_FALSE; - dvd->vd_vn = vp; - return 0; + return (0); } static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; - vnode_t *vp; if (vd->vdev_reopening || dvd == NULL) return; - if ((vp = dvd->vd_vn) != NULL) { -/* XXX NetBSD Sometimes we deadlock on this why ? */ -// vprint("vnode close info", vp); - vn_close(vp, FREAD|FWRITE, kauth_cred_get()); -// vprint("vnode close info", vp); -/* XXX is this needed ? vrele(vp); */ +#ifdef illumos + if (dvd->vd_minor != NULL) { + ddi_devid_str_free(dvd->vd_minor); + dvd->vd_minor = NULL; + } + + if (dvd->vd_devid != NULL) { + ddi_devid_free(dvd->vd_devid); + dvd->vd_devid = NULL; + } + + if (dvd->vd_lh != NULL) { + (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); + dvd->vd_lh = NULL; + } +#endif + +#ifdef __NetBSD__ + if (dvd->vd_vp != NULL) { + vn_close(dvd->vd_vp, FREAD|FWRITE, kcred); + dvd->vd_vp = NULL; + } + if (dvd->vd_wq != NULL) { workqueue_destroy(dvd->vd_wq); + dvd->vd_wq = NULL; } +#endif - kmem_free(dvd, sizeof (vdev_disk_t)); - vd->vdev_tsd = NULL; + vd->vdev_delayed_close = B_FALSE; +#ifdef illumos + /* + * If we closed the LDI handle due to an offline notify from LDI, + * don't free vd->vdev_tsd or unregister the callbacks here; + * the offline finalize callback or a reopen will take care of it. + */ + if (dvd->vd_ldi_offline) + return; +#endif + + vdev_disk_free(vd); +} + +int +vdev_disk_physio(vdev_t *vd, caddr_t data, + size_t size, uint64_t offset, int flags, boolean_t isdump) +{ +#ifdef illumos + vdev_disk_t *dvd = vd->vdev_tsd; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) + return (EIO); + + ASSERT(vd->vdev_ops == &vdev_disk_ops); + + /* + * If in the context of an active crash dump, use the ldi_dump(9F) + * call instead of ldi_strategy(9F) as usual. + */ + if (isdump) { + ASSERT3P(dvd, !=, NULL); + return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), + lbtodb(size))); + } + + return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); +#endif +#ifdef __NetBSD__ + return (EIO); +#endif } static void @@ -201,19 +351,13 @@ vdev_disk_io_intr(buf_t *bp) * Rather than teach the rest of the stack about other error * possibilities (EFAULT, etc), we normalize the error value here. */ - if (bp->b_error == 0) { - if (bp->b_resid != 0) { - zio->io_error = EIO; - } else { - zio->io_error = 0; - } - } else { - zio->io_error = EIO; - } + zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0); + if (zio->io_error == 0 && bp->b_resid != 0) + zio->io_error = SET_ERROR(EIO); putiobuf(bp); - zio_interrupt(zio); + zio_delay_interrupt(zio); } static void @@ -237,7 +381,7 @@ vdev_disk_ioctl_done(void *zio_arg, int zio_interrupt(zio); } -static int +static void vdev_disk_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -246,12 +390,32 @@ vdev_disk_io_start(zio_t *zio) buf_t *bp, *nbp; int error, size, off, resid; - vp = dvd->vd_vn; + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. 
+ */ +#ifdef illumos + if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } +#endif +#ifdef __NetBSD__ + if (dvd == NULL) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + vp = dvd->vd_vp; +#endif + if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ if (!vdev_readable(vd)) { - zio->io_error = ENXIO; - return (ZIO_PIPELINE_CONTINUE); + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; } switch (zio->io_cmd) { @@ -268,15 +432,15 @@ vdev_disk_io_start(zio_t *zio) bp = getiobuf(vp, true); bp->b_private = zio; workqueue_enqueue(dvd->vd_wq, &bp->b_work, NULL); - return (ZIO_PIPELINE_STOP); - break; + return; default: - zio->io_error = ENOTSUP; + zio->io_error = SET_ERROR(ENOTSUP); break; } - return (ZIO_PIPELINE_CONTINUE); + zio_execute(zio); + return; } bp = getiobuf(vp, true); @@ -294,7 +458,7 @@ vdev_disk_io_start(zio_t *zio) vp->v_numoutput++; mutex_exit(vp->v_interlock); } - + if (bp->b_bcount <= MAXPHYS) { /* We can do this I/O in one pass. */ (void)VOP_STRATEGY(vp, bp); @@ -316,15 +480,40 @@ vdev_disk_io_start(zio_t *zio) off += size; } } - - return (ZIO_PIPELINE_STOP); } static void vdev_disk_io_done(zio_t *zio) { +#ifdef illumos + vdev_t *vd = zio->io_vd; - /* NetBSD: nothing */ + /* + * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if + * the device has been removed. If this is the case, then we trigger an + * asynchronous removal of the device. Otherwise, probe the device and + * make sure it's still accessible. + */ + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { + vdev_disk_t *dvd = vd->vdev_tsd; + int state = DKIO_NONE; + + if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, + FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. + */ + zfs_post_remove(zio->io_spa, vd); + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } +#endif } vdev_ops_t vdev_disk_ops = { @@ -334,6 +523,8 @@ vdev_ops_t vdev_disk_ops = { vdev_disk_io_start, vdev_disk_io_done, NULL, + vdev_disk_hold, + vdev_disk_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -345,6 +536,78 @@ vdev_ops_t vdev_disk_ops = { int vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) { +#ifdef __NetBSD__ + return (ENOTSUP); +#else + ldi_handle_t vd_lh; + vdev_label_t *label; + uint64_t s, size; + int l; + ddi_devid_t tmpdevid; + int error = -1; + char *minor_name; + + /* + * Read the device label and build the nvlist. 
+ */ + if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, + &minor_name) == 0) { + error = ldi_open_by_devid(tmpdevid, minor_name, + FREAD, kcred, &vd_lh, zfs_li); + ddi_devid_free(tmpdevid); + ddi_devid_str_free(minor_name); + } + + if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, + zfs_li))) + return (error); + + if (ldi_get_size(vd_lh, &s)) { + (void) ldi_close(vd_lh, FREAD, kcred); + return (SET_ERROR(EIO)); + } + + size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); + label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + + *config = NULL; + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t offset, state, txg = 0; + + /* read vdev label */ + offset = vdev_label_offset(size, l, 0); + if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state >= POOL_STATE_DESTROYED) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + kmem_free(label, sizeof (vdev_label_t)); + (void) ldi_close(vd_lh, FREAD, kcred); + if (*config == NULL) + error = SET_ERROR(EIDRM); - return EOPNOTSUPP; + return (error); +#endif } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c,v retrieving revision 1.2 diff -u -p -r1.2 vdev_file.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c 23 Sep 2013 20:41:19 -0000 1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c 5 May 2017 18:02:28 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include @@ -35,8 +35,21 @@ * Virtual device vector for files. */ +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; vnode_t *vp; @@ -48,7 +61,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psi */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -58,6 +71,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psi if (vd->vdev_tsd != NULL) { ASSERT(vd->vdev_reopening); vf = vd->vdev_tsd; + vp = vf->vf_vnode; goto skip_open; } @@ -75,6 +89,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psi if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; return (error); } @@ -85,24 +101,40 @@ vdev_file_open(vdev_t *vd, uint64_t *psi * Make sure it's a regular file. 
*/ if (vp->v_type != VREG) { + (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (ENODEV); + return (SET_ERROR(ENODEV)); } -#endif +#endif /* _KERNEL */ skip_open: /* * Determine the physical size of the file. */ vattr.va_mask = AT_SIZE; - error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); +#ifdef __FreeBSD__ + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_GETATTR(vp, &vattr, kcred); + VOP_UNLOCK(vp, 0); +#endif +#ifdef __NetBSD__ + error = VOP_GETATTR(vp, &vattr, 0, kcred, NULL); +#endif if (error) { + (void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + kmem_free(vd->vdev_tsd, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; return (error); } - *psize = vattr.va_size; - *ashift = SPA_MINBLOCKSHIFT; + vd->vdev_notrim = B_TRUE; + + *max_psize = *psize = vattr.va_size; + *logical_ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; return (0); } @@ -116,53 +148,62 @@ vdev_file_close(vdev_t *vd) return; if (vf->vf_vnode != NULL) { - (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL); - VN_RELE(vf->vf_vnode); } + vd->vdev_delayed_close = B_FALSE; kmem_free(vf, sizeof (vdev_file_t)); vd->vdev_tsd = NULL; } -static int +static void vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid = 0; + vdev_file_t *vf; + vnode_t *vp; + ssize_t resid; - if (zio->io_type == ZIO_TYPE_IOCTL) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = ENXIO; - return (ZIO_PIPELINE_CONTINUE); - } + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + vf = vd->vdev_tsd; + vp = vf->vf_vnode; + if (zio->io_type == ZIO_TYPE_IOCTL) { switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: - zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, + zio->io_error = VOP_FSYNC(vp, FSYNC | FDSYNC, kcred, NULL); break; default: - zio->io_error = ENOTSUP; + zio->io_error = SET_ERROR(ENOTSUP); } - return (ZIO_PIPELINE_CONTINUE); + zio_execute(zio); + return; } + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, - zio->io_size, zio->io_offset, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid); + UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size, + zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; - zio_interrupt(zio); + zio_delay_interrupt(zio); - return (ZIO_PIPELINE_STOP); +#ifdef illumos + VERIFY3U(taskq_dispatch(system_taskq, vdev_file_io_strategy, bp, + TQ_SLEEP), !=, 0); +#endif } /* ARGSUSED */ @@ -178,6 +219,8 @@ vdev_ops_t vdev_file_ops = { vdev_file_io_start, vdev_file_io_done, NULL, + vdev_file_hold, + vdev_file_rele, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -194,6 +237,8 @@ vdev_ops_t vdev_disk_ops = { vdev_file_io_start, vdev_file_io_done, NULL, + vdev_file_hold, + vdev_file_rele, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_geom.c 3 Dec 2016 17:03:49 -0000 @@ -0,0 +1,1077 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Portions Copyright (c) 2012 Martin Matuska + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for GEOM. + */ + +static g_attrchanged_t vdev_geom_attrchanged; +struct g_class zfs_vdev_class = { + .name = "ZFS::VDEV", + .version = G_VERSION, + .attrchanged = vdev_geom_attrchanged, +}; + +DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); + +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ +static int vdev_geom_bio_flush_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); + +/* Declare local functions */ +static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); + +/* + * Thread local storage used to indicate when a thread is probing geoms + * for their guids. If NULL, this thread is not tasting geoms. If non NULL, + * it is looking for a replacement for the vdev_t* that is its value. 
+ */ +uint_t zfs_geom_probe_vdev_key; + +static void +vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp) +{ + int error; + uint16_t rate; + + error = g_getattr("GEOM::rotation_rate", cp, &rate); + if (error == 0) + vd->vdev_rotation_rate = rate; + else + vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN; +} + +static void +vdev_geom_set_physpath(struct g_consumer *cp, boolean_t do_null_update) +{ + boolean_t needs_update = B_FALSE; + vdev_t *vd; + char *physpath; + int error, physpath_len; + + if (g_access(cp, 1, 0, 0) != 0) + return; + + vd = cp->private; + physpath_len = MAXPATHLEN; + physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); + error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); + g_access(cp, -1, 0, 0); + if (error == 0) { + char *old_physpath; + + /* g_topology lock ensures that vdev has not been closed */ + g_topology_assert(); + old_physpath = vd->vdev_physpath; + vd->vdev_physpath = spa_strdup(physpath); + + if (old_physpath != NULL) { + needs_update = (strcmp(old_physpath, + vd->vdev_physpath) != 0); + spa_strfree(old_physpath); + } else + needs_update = do_null_update; + } + g_free(physpath); + + /* + * If the physical path changed, update the config. + * Only request an update for previously unset physpaths if + * requested by the caller. + */ + if (needs_update) + spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); + +} + +static void +vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) +{ + vdev_t *vd; + char *old_physpath; + int error; + + vd = cp->private; + if (vd == NULL) + return; + + if (strcmp(attr, "GEOM::rotation_rate") == 0) { + vdev_geom_set_rotation_rate(vd, cp); + return; + } + + if (strcmp(attr, "GEOM::physpath") == 0) { + vdev_geom_set_physpath(cp, /*do_null_update*/B_TRUE); + return; + } +} + +static void +vdev_geom_orphan(struct g_consumer *cp) +{ + vdev_t *vd; + + g_topology_assert(); + + vd = cp->private; + if (vd == NULL) { + /* Vdev close in progress. Ignore the event. */ + return; + } + + /* + * Orphan callbacks occur from the GEOM event thread. + * Concurrent with this call, new I/O requests may be + * working their way through GEOM about to find out + * (only once executed by the g_down thread) that we've + * been orphaned from our disk provider. These I/Os + * must be retired before we can detach our consumer. + * This is most easily achieved by acquiring the + * SPA ZIO configuration lock as a writer, but doing + * so with the GEOM topology lock held would cause + * a lock order reversal. Instead, rely on the SPA's + * async removal support to invoke a close on this + * vdev once it is safe to do so. + */ + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); +} + +static struct g_consumer * +vdev_geom_attach(struct g_provider *pp, vdev_t *vd) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + g_topology_assert(); + + ZFS_LOG(1, "Attaching to %s.", pp->name); + + if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { + ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n", + pp->name, pp->sectorsize); + return (NULL); + } else if (pp->mediasize < SPA_MINDEVSIZE) { + ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n", + pp->name, pp->mediasize); + return (NULL); + } + + /* Do we have geom already? No? Create one. 
*/ + LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + if (strcmp(gp->name, "zfs::vdev") != 0) + continue; + break; + } + if (gp == NULL) { + gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); + gp->orphan = vdev_geom_orphan; + gp->attrchanged = vdev_geom_attrchanged; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); + } else { + /* Check if we are already connected to this provider. */ + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (cp->provider == pp) { + ZFS_LOG(1, "Found consumer for %s.", pp->name); + break; + } + } + if (cp == NULL) { + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created consumer for %s.", pp->name); + } else { + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + return (NULL); + } + ZFS_LOG(1, "Used existing consumer for %s.", pp->name); + } + } + + /* + * BUG: cp may already belong to a vdev. This could happen if: + * 1) That vdev is a shared spare, or + * 2) We are trying to reopen a missing vdev and we are scanning by + * guid. In that case, we'll ultimately fail to open this consumer, + * but not until after setting the private field. + * The solution is to: + * 1) Don't set the private field until after the open succeeds, and + * 2) Set it to a linked list of vdevs, not just a single vdev + */ + cp->private = vd; + if (vd != NULL) { + vd->vdev_tsd = cp; + vdev_geom_set_physpath(cp, /*do_null_update*/B_FALSE); + } + + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + return (cp); +} + +static void +vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) +{ + struct g_geom *gp; + vdev_t *vd; + + g_topology_assert(); + + ZFS_LOG(1, "Detaching from %s.", + cp->provider && cp->provider->name ? cp->provider->name : "NULL"); + + vd = cp->private; + cp->private = NULL; + + gp = cp->geom; + if (open_for_read) + g_access(cp, -1, 0, -1); + /* Destroy consumer on last close. */ + if (cp->acr == 0 && cp->ace == 0) { + if (cp->acw > 0) + g_access(cp, 0, -cp->acw, 0); + if (cp->provider != NULL) { + ZFS_LOG(1, "Destroying consumer for %s.", + cp->provider->name ? cp->provider->name : "NULL"); + g_detach(cp); + } + g_destroy_consumer(cp); + } + /* Destroy geom if there are no consumers left. 
*/ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(gp, ENXIO); + } +} + +static void +vdev_geom_close_locked(vdev_t *vd) +{ + struct g_consumer *cp; + + g_topology_assert(); + + cp = vd->vdev_tsd; + vd->vdev_tsd = NULL; + vd->vdev_delayed_close = B_FALSE; + if (cp == NULL) + return; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + + vdev_geom_detach(cp, B_TRUE); +} + +/* + * Issue one or more bios to the vdev in parallel + * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO + * operation is described by parallel entries from each array. There may be + * more bios actually issued than entries in the array + */ +static void +vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, + off_t *sizes, int *errors, int ncmds) +{ + struct bio **bios; + u_char *p; + off_t off, maxio, s, end; + int i, n_bios, j; + size_t bios_size; + + maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); + n_bios = 0; + + /* How many bios are required for all commands ? */ + for (i = 0; i < ncmds; i++) + n_bios += (sizes[i] + maxio - 1) / maxio; + + /* Allocate memory for the bios */ + bios_size = n_bios * sizeof(struct bio*); + bios = kmem_zalloc(bios_size, KM_SLEEP); + + /* Prepare and issue all of the bios */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + p = datas[i]; + s = sizes[i]; + end = off + s; + ASSERT((off % cp->provider->sectorsize) == 0); + ASSERT((s % cp->provider->sectorsize) == 0); + + for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { + bios[j] = g_alloc_bio(); + bios[j]->bio_cmd = cmds[i]; + bios[j]->bio_done = NULL; + bios[j]->bio_offset = off; + bios[j]->bio_length = MIN(s, maxio); + bios[j]->bio_data = p; + g_io_request(bios[j], cp); + } + } + ASSERT(j == n_bios); + + /* Wait for all of the bios to complete, and clean them up */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + s = sizes[i]; + end = off + s; + + for (; off < end; off += maxio, s -= maxio, j++) { + errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i]; + g_destroy_bio(bios[j]); + } + } + kmem_free(bios, bios_size); +} + +static int +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config) +{ + struct g_provider *pp; + vdev_phys_t *vdev_lists[VDEV_LABELS]; + char *p, *buf; + size_t buflen; + uint64_t psize, state, txg; + off_t offsets[VDEV_LABELS]; + off_t size; + off_t sizes[VDEV_LABELS]; + int cmds[VDEV_LABELS]; + int errors[VDEV_LABELS]; + int l, len; + + g_topology_assert_not(); + + pp = cp->provider; + ZFS_LOG(1, "Reading config from %s...", pp->name); + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t)); + + size = sizeof(*vdev_lists[0]) + pp->sectorsize - + ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1; + + buflen = sizeof(vdev_lists[0]->vp_nvlist); + + *config = NULL; + /* Create all of the IO requests */ + for (l = 0; l < VDEV_LABELS; l++) { + cmds[l] = BIO_READ; + vdev_lists[l] = kmem_alloc(size, KM_SLEEP); + offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; + sizes[l] = size; + errors[l] = 0; + ASSERT(offsets[l] % pp->sectorsize == 0); + } + + /* Issue the IO requests */ + vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, + VDEV_LABELS); + + /* Parse the labels */ + for (l = 0; l < VDEV_LABELS; l++) { + if (errors[l] != 0) + continue; + + buf = vdev_lists[l]->vp_nvlist; + + if (nvlist_unpack(buf, buflen, config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) 
!= 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(*config); + *config = NULL; + continue; + } + + if (state != POOL_STATE_SPARE && + state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(*config); + *config = NULL; + continue; + } + + break; + } + + /* Free the label storage */ + for (l = 0; l < VDEV_LABELS; l++) + kmem_free(vdev_lists[l], size); + + return (*config == NULL ? ENOENT : 0); +} + +static void +resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) +{ + nvlist_t **new_configs; + uint64_t i; + + if (id < *count) + return; + new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *), + KM_SLEEP); + for (i = 0; i < *count; i++) + new_configs[i] = (*configs)[i]; + if (*configs != NULL) + kmem_free(*configs, *count * sizeof(void *)); + *configs = new_configs; + *count = id + 1; +} + +static void +process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, + const char *name, uint64_t* known_pool_guid) +{ + nvlist_t *vdev_tree; + uint64_t pool_guid; + uint64_t vdev_guid, known_guid; + uint64_t id, txg, known_txg; + char *pname; + int i; + + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + strcmp(pname, name) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) + goto ignore; + + if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + goto ignore; + + if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) + goto ignore; + + VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + if (*known_pool_guid != 0) { + if (pool_guid != *known_pool_guid) + goto ignore; + } else + *known_pool_guid = pool_guid; + + resize_configs(configs, count, id); + + if ((*configs)[id] != NULL) { + VERIFY(nvlist_lookup_uint64((*configs)[id], + ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); + if (txg <= known_txg) + goto ignore; + nvlist_free((*configs)[id]); + } + + (*configs)[id] = cfg; + return; + +ignore: + nvlist_free(cfg); +} + +int +vdev_geom_read_pool_label(const char *name, + nvlist_t ***configs, uint64_t *count) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *zcp; + nvlist_t *vdev_cfg; + uint64_t pool_guid; + int error; + + DROP_GIANT(); + g_topology_lock(); + + *configs = NULL; + *count = 0; + pool_guid = 0; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + zcp = vdev_geom_attach(pp, NULL); + if (zcp == NULL) + continue; + g_topology_unlock(); + error = vdev_geom_read_config(zcp, &vdev_cfg); + g_topology_lock(); + vdev_geom_detach(zcp, B_TRUE); + if (error) + continue; + ZFS_LOG(1, "successfully read vdev config"); + + process_vdev_config(configs, count, + vdev_cfg, name, &pool_guid); + } + } + } + g_topology_unlock(); + PICKUP_GIANT(); + + return (*count > 0 ? 
0 : ENOENT); +} + +enum match { + NO_MATCH, + TOP_MATCH, + FULL_MATCH +}; + +static enum match +vdev_attach_ok(vdev_t *vd, struct g_provider *pp) +{ + nvlist_t *config; + uint64_t pool_guid, top_guid, vdev_guid; + struct g_consumer *cp; + + cp = vdev_geom_attach(pp, NULL); + if (cp == NULL) { + ZFS_LOG(1, "Unable to attach tasting instance to %s.", + pp->name); + return (NO_MATCH); + } + g_topology_unlock(); + if (vdev_geom_read_config(cp, &config) != 0) { + g_topology_lock(); + vdev_geom_detach(cp, B_TRUE); + ZFS_LOG(1, "Unable to read config from %s.", pp->name); + return (NO_MATCH); + } + g_topology_lock(); + vdev_geom_detach(cp, B_TRUE); + + pool_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); + top_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); + vdev_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + nvlist_free(config); + + /* + * Check that the label's pool guid matches the desired guid. + * Inactive spares and L2ARCs do not have any pool guid in the label. + */ + if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { + ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", + pp->name, + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); + return (NO_MATCH); + } + + /* + * Check that the label's vdev guid matches the desired guid. + * The second condition handles possible race on vdev detach, when + * remaining vdev receives GUID of destroyed top level mirror vdev. + */ + if (vdev_guid == vd->vdev_guid) { + ZFS_LOG(1, "guids match for provider %s.", pp->name); + return (FULL_MATCH); + } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { + ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); + return (TOP_MATCH); + } + ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", + pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); + return (NO_MATCH); +} + +static struct g_consumer * +vdev_geom_attach_by_guids(vdev_t *vd) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *cp; + enum match m; + + g_topology_assert(); + + cp = NULL; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + m = vdev_attach_ok(vd, pp); + if (m == NO_MATCH) + continue; + if (cp != NULL) { + if (m == FULL_MATCH) + vdev_geom_detach(cp, B_TRUE); + else + continue; + } + cp = vdev_geom_attach(pp, vd); + if (cp == NULL) { + printf("ZFS WARNING: Unable to " + "attach to %s.\n", pp->name); + continue; + } + if (m == FULL_MATCH) + return (cp); + } + } + } + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_guids(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t len; + + g_topology_assert(); + + ZFS_LOG(1, "Searching by guids [%ju:%ju].", + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); + cp = vdev_geom_attach_by_guids(vd); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid, vd->vdev_path); + } else { + ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid); + } + + return (cp); +} 
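
The matching policy implemented by vdev_attach_ok() and vdev_geom_attach_by_guids() above is: a provider whose label carries the leaf vdev's own GUID is a FULL_MATCH and is taken immediately; a provider whose label only carries the matching top-level GUID (possible after a racing detach of a mirror) is a TOP_MATCH and is kept only as a fallback until a full match turns up. A minimal standalone sketch of that decision, using plain structs rather than the real nvlist/GEOM types (the names here are illustrative, not part of the patch):

#include <stdio.h>
#include <stdint.h>

enum match { NO_MATCH, TOP_MATCH, FULL_MATCH };

struct label_guids { uint64_t pool_guid, top_guid, vdev_guid; };
struct wanted_vdev { uint64_t pool_guid, vdev_guid; int is_top; };

/* Same decision order as vdev_attach_ok(). */
static enum match
attach_ok(const struct wanted_vdev *w, const struct label_guids *l)
{
	/* Inactive spares/L2ARC carry no pool guid, so 0 never mismatches. */
	if (l->pool_guid != 0 && l->pool_guid != w->pool_guid)
		return (NO_MATCH);
	if (l->vdev_guid == w->vdev_guid)
		return (FULL_MATCH);
	if (l->top_guid == w->vdev_guid && w->is_top)
		return (TOP_MATCH);
	return (NO_MATCH);
}

int
main(void)
{
	struct wanted_vdev want = { 100, 42, 1 };
	struct label_guids top_only = { 100, 42, 7 };	/* expect TOP_MATCH */
	struct label_guids exact = { 100, 42, 42 };	/* expect FULL_MATCH */

	/*
	 * vdev_geom_attach_by_guids() keeps a TOP_MATCH provider as a
	 * candidate but detaches it as soon as a FULL_MATCH is found.
	 */
	printf("top_only=%d exact=%d\n",
	    attach_ok(&want, &top_only), attach_ok(&want, &exact));
	return (0);
}
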
+ +static struct g_consumer * +vdev_geom_open_by_path(vdev_t *vd, int check_guid) +{ + struct g_provider *pp; + struct g_consumer *cp; + + g_topology_assert(); + + cp = NULL; + pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1); + if (pp != NULL) { + ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); + if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) + cp = vdev_geom_attach(pp, vd); + } + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + struct g_provider *pp; + struct g_consumer *cp; + size_t bufsize; + int error; + + /* Set the TLS to indicate downstack that we should not access zvols*/ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if ((cp = vd->vdev_tsd) != NULL) { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + + DROP_GIANT(); + g_topology_lock(); + error = 0; + + if (vd->vdev_spa->spa_splitting_newspa || + (vd->vdev_prevstate == VDEV_STATE_UNKNOWN && + vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) { + /* + * We are dealing with a vdev that hasn't been previously + * opened (since boot), and we are not loading an + * existing pool configuration. This looks like a + * vdev add operation to a new or existing pool. + * Assume the user knows what he/she is doing and find + * GEOM provider by its name, ignoring GUID mismatches. + * + * XXPOLICY: It would be safer to only allow a device + * that is unlabeled or labeled but missing + * GUID information to be opened in this fashion, + * unless we are doing a split, in which case we + * should allow any guid. + */ + cp = vdev_geom_open_by_path(vd, 0); + } else { + /* + * Try using the recorded path for this device, but only + * accept it if its label data contains the expected GUIDs. + */ + cp = vdev_geom_open_by_path(vd, 1); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the + * expected GUIDs. The disks might have merely + * moved around so try all other GEOM providers + * to find one with the right GUIDs. + */ + cp = vdev_geom_open_by_guids(vd); + } + } + + /* Clear the TLS now that tasting is done */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); + + if (cp == NULL) { + ZFS_LOG(1, "Provider %s not found.", vd->vdev_path); + error = ENOENT; + } else if (cp->provider->sectorsize > VDEV_PAD_SIZE || + !ISP2(cp->provider->sectorsize)) { + ZFS_LOG(1, "Provider %s has unsupported sectorsize.", + vd->vdev_path); + + vdev_geom_close_locked(vd); + error = EINVAL; + cp = NULL; + } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) { + int i; + + for (i = 0; i < 5; i++) { + error = g_access(cp, 0, 1, 0); + if (error == 0) + break; + g_topology_unlock(); + tsleep(vd, 0, "vdev", hz / 2); + g_topology_lock(); + } + if (error != 0) { + printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n", + vd->vdev_path, error); + vdev_geom_close_locked(vd); + cp = NULL; + } + } + + /* Fetch initial physical path information for this device. 
*/ + if (cp != NULL) + vdev_geom_attrchanged(cp, "GEOM::physpath"); + + g_topology_unlock(); + PICKUP_GIANT(); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } +skip_open: + pp = cp->provider; + + /* + * Determine the actual size of the device. + */ + *max_psize = *psize = pp->mediasize; + + /* + * Determine the device's minimum transfer size and preferred + * transfer size. + */ + *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = 0; + if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && + pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0) + *physical_ashift = highbit(pp->stripesize) - 1; + + /* + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* + * Determine the device's rotation rate. + */ + vdev_geom_set_rotation_rate(vd, cp); + + return (0); +} + +static void +vdev_geom_close(vdev_t *vd) +{ + + if (vd->vdev_reopening) + return; + + DROP_GIANT(); + g_topology_lock(); + vdev_geom_close_locked(vd); + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +vdev_geom_io_intr(struct bio *bp) +{ + vdev_t *vd; + zio_t *zio; + + zio = bp->bio_caller1; + vd = zio->io_vd; + zio->io_error = bp->bio_error; + if (zio->io_error == 0 && bp->bio_resid != 0) + zio->io_error = SET_ERROR(EIO); + + switch(zio->io_error) { + case ENOTSUP: + /* + * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know + * that future attempts will never succeed. In this case + * we set a persistent flag so that we don't bother with + * requests in the future. + */ + switch(bp->bio_cmd) { + case BIO_FLUSH: + vd->vdev_nowritecache = B_TRUE; + break; + case BIO_DELETE: + vd->vdev_notrim = B_TRUE; + break; + } + break; + case ENXIO: + if (!vd->vdev_remove_wanted) { + /* + * If provider's error is set we assume it is being + * removed. + */ + if (bp->bio_to->error != 0) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, + SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } + break; + } + g_destroy_bio(bp); + zio_delay_interrupt(zio); +} + +static void +vdev_geom_io_start(zio_t *zio) +{ + vdev_t *vd; + struct g_consumer *cp; + struct bio *bp; + int error; + + vd = zio->io_vd; + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } else { + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + if (zfs_nocacheflush || vdev_geom_bio_flush_disable) + break; + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + goto sendreq; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + } + + zio_execute(zio); + return; + case ZIO_TYPE_FREE: + if (vd->vdev_notrim) { + zio->io_error = SET_ERROR(ENOTSUP); + } else if (!vdev_geom_bio_delete_disable) { + goto sendreq; + } + zio_execute(zio); + return; + } +sendreq: + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE || + zio->io_type == ZIO_TYPE_IOCTL); + + cp = vd->vdev_tsd; + if (cp == NULL) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + bp = g_alloc_bio(); + bp->bio_caller1 = zio; + switch (zio->io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? 
BIO_READ : BIO_WRITE; + bp->bio_data = zio->io_data; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_FREE: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_IOCTL: + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + } + bp->bio_done = vdev_geom_io_intr; + + g_io_request(bp, cp); +} + +static void +vdev_geom_io_done(zio_t *zio) +{ +} + +static void +vdev_geom_hold(vdev_t *vd) +{ +} + +static void +vdev_geom_rele(vdev_t *vd) +{ +} + +vdev_ops_t vdev_geom_ops = { + vdev_geom_open, + vdev_geom_close, + vdev_default_asize, + vdev_geom_io_start, + vdev_geom_io_done, + NULL, + vdev_geom_hold, + vdev_geom_rele, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 vdev_label.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c 27 Feb 2010 22:31:12 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c 3 Dec 2016 17:03:49 -0000 @@ -18,9 +18,10 @@ * * CDDL HEADER END */ + /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ /* @@ -122,6 +123,8 @@ * txg Transaction group in which this label was written * pool_guid Unique identifier for this pool * vdev_tree An nvlist describing vdev tree. + * features_for_read + * An nvlist of the features necessary for reading the MOS. * * Each leaf device label also contains the following: * @@ -141,8 +144,15 @@ #include #include #include +#include +#include #include +static boolean_t vdev_trim_on_init = B_TRUE; +SYSCTL_DECL(_vfs_zfs_vdev); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW, + &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); + /* * Basic routines to read and write from a vdev label. * Used throughout the rest of this file. 
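
The vdev_label.c hunks that follow convert the older VERIFY(nvlist_add_*(...) == 0) pattern to the fnvlist_* wrappers, which make the same additions but assert success internally. A small userland illustration of the two styles, assuming the libnvpair fnvlist interfaces are available (the key name used here is an arbitrary example, not one of the ZPOOL_CONFIG_* keys):

#include <stdlib.h>
#include <stdint.h>
#include <libnvpair.h>

static nvlist_t *
config_old_style(uint64_t guid)
{
	nvlist_t *nv;

	/* Old pattern: each call checked explicitly (VERIFY(... == 0) in the kernel). */
	if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0 ||
	    nvlist_add_uint64(nv, "example_guid", guid) != 0)
		abort();
	return (nv);
}

static nvlist_t *
config_new_style(uint64_t guid)
{
	/* New pattern: fnvlist_* wrappers perform the addition and assert internally. */
	nvlist_t *nv = fnvlist_alloc();

	fnvlist_add_uint64(nv, "example_guid", guid);
	return (nv);
}

int
main(void)
{
	nvlist_t *a = config_old_style(42), *b = config_new_style(42);

	nvlist_free(a);
	fnvlist_free(b);
	return (0);
}
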
@@ -175,7 +185,7 @@ vdev_label_number(uint64_t psize, uint64 static void vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) + uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -189,7 +199,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, static void vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, - uint64_t size, zio_done_func_t *done, void *private, int flags) + uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == @@ -208,34 +218,29 @@ vdev_label_write(zio_t *zio, vdev_t *vd, */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare, boolean_t isl2cache) + vdev_config_flag_t flags) { nvlist_t *nv = NULL; - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + nv = fnvlist_alloc(); - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, - vd->vdev_ops->vdev_op_type) == 0); - if (!isspare && !isl2cache) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) - == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); if (vd->vdev_path != NULL) - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, - vd->vdev_path) == 0); + fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); if (vd->vdev_devid != NULL) - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, - vd->vdev_devid) == 0); + fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); if (vd->vdev_physpath != NULL) - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, - vd->vdev_physpath) == 0); + fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath); if (vd->vdev_fru != NULL) - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, - vd->vdev_fru) == 0); + fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, @@ -256,65 +261,103 @@ vdev_config_generate(spa_t *spa, vdev_t * that only support a single parity device -- older software * will just ignore it. 
*/ - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, - vd->vdev_nparity) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); } if (vd->vdev_wholedisk != -1ULL) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - vd->vdev_wholedisk) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + vd->vdev_wholedisk); if (vd->vdev_not_present) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); if (vd->vdev_isspare) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); - if (!isspare && !isl2cache && vd == vd->vdev_top) { - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, - vd->vdev_ms_array) == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, - vd->vdev_ms_shift) == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, - vd->vdev_ashift) == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, - vd->vdev_asize) == 0); - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, - vd->vdev_islog) == 0); - } - - if (vd->vdev_dtl_smo.smo_object != 0) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - vd->vdev_dtl_smo.smo_object) == 0); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && + vd == vd->vdev_top) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + vd->vdev_ms_array); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + vd->vdev_ms_shift); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, + vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); + if (vd->vdev_removing) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, + vd->vdev_removing); + } + + if (vd->vdev_dtl_sm != NULL) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, + space_map_object(vd->vdev_dtl_sm)); + } if (vd->vdev_crtxg) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, - vd->vdev_crtxg) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); + + if (flags & VDEV_CONFIG_MOS) { + if (vd->vdev_leaf_zap != 0) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, + vd->vdev_leaf_zap); + } + + if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, + vd->vdev_top_zap); + } + } if (getstats) { vdev_stat_t vs; + pool_scan_stat_t ps; + vdev_get_stats(vd, &vs); - VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0); + fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); + + /* provide either current or previous scan information */ + if (spa_scan_get_stats(spa, &ps) == 0) { + fnvlist_add_uint64_array(nv, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)); + } } if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c; + int c, idx; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0; c < vd->vdev_children; c++) - child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare, isl2cache); + for (c = 0, idx = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + /* + * If we're generating an nvlist of removing + * vdevs then skip over any device which is + * not being removed. 
+ */ + if ((flags & VDEV_CONFIG_REMOVING) && + !cvd->vdev_removing) + continue; - VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - child, vd->vdev_children) == 0); + child[idx++] = vdev_config_generate(spa, cvd, + getstats, flags); + } + + if (idx) { + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + child, idx); + } - for (c = 0; c < vd->vdev_children; c++) + for (c = 0; c < idx; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); @@ -323,23 +366,20 @@ vdev_config_generate(spa_t *spa, vdev_t const char *aux = NULL; if (vd->vdev_offline && !vd->vdev_tmpoffline) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, - B_TRUE) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); + if (vd->vdev_resilver_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, + vd->vdev_resilver_txg); if (vd->vdev_faulted) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, - B_TRUE) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); if (vd->vdev_degraded) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, - B_TRUE) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); if (vd->vdev_removed) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, - B_TRUE) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); if (vd->vdev_unspare) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, - B_TRUE) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); if (vd->vdev_ishole) - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, - B_TRUE) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); switch (vd->vdev_stat.vs_aux) { case VDEV_AUX_ERR_EXCEEDED: @@ -352,12 +392,11 @@ vdev_config_generate(spa_t *spa, vdev_t } if (aux != NULL) - VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, - aux) == 0); + fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { - VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, - vd->vdev_orig_guid) == 0); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, + vd->vdev_orig_guid); } } @@ -375,12 +414,11 @@ vdev_top_config_generate(spa_t *spa, nvl { vdev_t *rvd = spa->spa_root_vdev; uint64_t *array; - uint_t idx; + uint_t c, idx; array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); - idx = 0; - for (int c = 0; c < rvd->vdev_children; c++) { + for (c = 0, idx = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; if (tvd->vdev_ishole) @@ -398,13 +436,23 @@ vdev_top_config_generate(spa_t *spa, nvl kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); } +/* + * Returns the configuration from the label of the given vdev. For vdevs + * which don't have a txg value stored on their label (i.e. spares/cache) + * or have not been completely initialized (txg = 0) just return + * the configuration from the first valid label we find. Otherwise, + * find the most up-to-date label that does not exceed the specified + * 'txg' value. 
+ */ nvlist_t * -vdev_label_read_config(vdev_t *vd) +vdev_label_read_config(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; + uint64_t best_txg = 0; + int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; @@ -417,6 +465,7 @@ vdev_label_read_config(vdev_t *vd) retry: for (int l = 0; l < VDEV_LABELS; l++) { + nvlist_t *label = NULL; zio = zio_root(spa, NULL, NULL, flags); @@ -426,12 +475,31 @@ retry: if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), - &config, 0) == 0) - break; + &label, 0) == 0) { + uint64_t label_txg = 0; - if (config != NULL) { - nvlist_free(config); - config = NULL; + /* + * Auxiliary vdevs won't have txg values in their + * labels and newly added vdevs may not have been + * completely initialized so just return the + * configuration from the first valid label we + * encounter. + */ + error = nvlist_lookup_uint64(label, + ZPOOL_CONFIG_POOL_TXG, &label_txg); + if ((error || label_txg == 0) && !config) { + config = label; + break; + } else if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + nvlist_free(config); + config = fnvlist_dup(label); + } + } + + if (label != NULL) { + nvlist_free(label); + label = NULL; } } @@ -466,7 +534,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, v /* * Read the label, if any, and perform some basic sanity checks. */ - if ((label = vdev_label_read_config(vd)) == NULL) + if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) return (B_FALSE); (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, @@ -544,6 +612,16 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, v return (B_TRUE); /* + * We can't rely on a pool's state if it's been imported + * read-only. Instead we look to see if the pools is marked + * read-only in the namespace and set the state to active. + */ + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (spa = spa_by_guid(pool_guid, device_guid)) != NULL && + spa_mode(spa) == FREAD) + state = POOL_STATE_ACTIVE; + + /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. */ @@ -583,21 +661,21 @@ vdev_label_init(vdev_t *vd, uint64_t crt /* Track the creation time for this vdev */ vd->vdev_crtxg = crtxg; - if (!vd->vdev_ops->vdev_op_leaf) + if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) return (0); /* * Dead vdevs cannot be initialized. */ if (vdev_is_dead(vd)) - return (EIO); + return (SET_ERROR(EIO)); /* * Determine if the vdev is in use. */ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) - return (EBUSY); + return (SET_ERROR(EBUSY)); /* * If this is a request to add or replace a spare or l2cache device @@ -645,6 +723,17 @@ vdev_label_init(vdev_t *vd, uint64_t crt } /* + * TRIM the whole thing so that we start with a clean slate. + * It's just an optimization, so we don't care if it fails. + * Don't TRIM if removing so that we don't interfere with zpool + * disaster recovery. + */ + if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim && + (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE || + reason == VDEV_LABEL_L2CACHE)) + zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize)); + + /* * Initialize its label. 
*/ vp = zio_buf_alloc(sizeof (vdev_phys_t)); @@ -782,6 +871,44 @@ retry: return (error); } +int +vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) +{ + spa_t *spa = vd->vdev_spa; + zio_t *zio; + char *pad2; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error; + + if (size > VDEV_PAD_SIZE) + return (EINVAL); + + if (!vd->vdev_ops->vdev_op_leaf) + return (ENODEV); + if (vdev_is_dead(vd)) + return (ENXIO); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + bzero(pad2, VDEV_PAD_SIZE); + memcpy(pad2, buf, size); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + vdev_label_write(zio, vd, 0, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + zio_buf_free(pad2, VDEV_PAD_SIZE); + return (error); +} + /* * ========================================================================== * uberblock load/sync @@ -794,7 +921,7 @@ retry: * come back up, we fail to see the uberblock for txg + 1 because, say, * it was on a mirrored device and the replica to which we wrote txg + 1 * is now offline. If we then make some changes and sync txg + 1, and then - * the missing replica comes back, then for a new seconds we'll have two + * the missing replica comes back, then for a few seconds we'll have two * conflicting uberblocks on disk with the same txg. The solution is simple: * among uberblocks with equal txg, choose the one with the latest timestamp. */ @@ -814,46 +941,47 @@ vdev_uberblock_compare(uberblock_t *ub1, return (0); } +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ +}; + static void vdev_uberblock_load_done(zio_t *zio) { + vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; - uberblock_t *ubbest = rio->io_private; + struct ubl_cbdata *cbp = rio->io_private; - ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(zio->io_vd)); + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); if (ub->ub_txg <= spa->spa_load_max_txg && - vdev_uberblock_compare(ub, ubbest) > 0) - *ubbest = *ub; + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev in which this uberblock + * was found. We will use this information later + * to obtain the config nvlist associated with + * this uberblock. 
+ */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + } mutex_exit(&rio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); } -void -vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) { - spa_t *spa = vd->vdev_spa; - vdev_t *rvd = spa->spa_root_vdev; - int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; - - if (vd == rvd) { - ASSERT(zio == NULL); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - zio = zio_root(spa, NULL, ubbest, flags); - bzero(ubbest, sizeof (uberblock_t)); - } - - ASSERT(zio != NULL); - for (int c = 0; c < vd->vdev_children; c++) - vdev_uberblock_load(zio, vd->vdev_child[c], ubbest); + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { for (int l = 0; l < VDEV_LABELS; l++) { @@ -866,11 +994,46 @@ vdev_uberblock_load(zio_t *zio, vdev_t * } } } +} - if (vd == rvd) { - (void) zio_wait(zio); - spa_config_exit(spa, SCL_ALL, FTAG); - } +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same vdev as the best uberblock. + */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + + /* + * It's possible that the best uberblock was discovered on a label + * that has a configuration which was written in a future txg. + * Search all labels on this vdev to find the configuration that + * matches the txg for our uberblock. + */ + if (cb.ubl_vd != NULL) + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@ -883,7 +1046,7 @@ vdev_uberblock_sync_done(zio_t *zio) uint64_t *good_writes = zio->io_private; if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) - atomic_add_64(good_writes, 1); + atomic_inc_64(good_writes); } /* @@ -919,6 +1082,7 @@ vdev_uberblock_sync(zio_t *zio, uberbloc zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); } +/* Sync the uberblocks to all vdevs in svd[] */ int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { @@ -957,7 +1121,7 @@ vdev_label_sync_done(zio_t *zio) uint64_t *good_writes = zio->io_private; if (zio->io_error == 0) - atomic_add_64(good_writes, 1); + atomic_inc_64(good_writes); } /* @@ -969,7 +1133,7 @@ vdev_label_sync_top_done(zio_t *zio) uint64_t *good_writes = zio->io_private; if (*good_writes == 0) - zio->io_error = EIO; + zio->io_error = SET_ERROR(EIO); kmem_free(good_writes, sizeof (uint64_t)); } @@ -1082,15 +1246,16 @@ vdev_label_sync_list(spa_t *spa, int l, * at any time, you can just call it again, and it will resume its work. 
*/ int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; vdev_t *vd; zio_t *zio; - int error; + int error = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; +retry: /* * Normally, we don't want to try too hard to write every label and * uberblock. If there is a flaky disk, we don't want the rest of the @@ -1098,8 +1263,11 @@ vdev_config_sync(vdev_t **svd, int svdco * single label out, we should retry with ZIO_FLAG_TRYHARD before * bailing out and declaring the pool faulted. */ - if (tryhard) + if (error != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) + return (error); flags |= ZIO_FLAG_TRYHARD; + } ASSERT(ub->ub_txg <= txg); @@ -1143,7 +1311,7 @@ vdev_config_sync(vdev_t **svd, int svdco * are committed to stable storage before the uberblock update. */ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) - return (error); + goto retry; /* * Sync the uberblocks to all vdevs in svd[]. @@ -1161,7 +1329,7 @@ vdev_config_sync(vdev_t **svd, int svdco * to the new uberblocks. */ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) - return (error); + goto retry; /* * Sync out odd labels for every dirty vdev. If the system dies @@ -1173,5 +1341,10 @@ vdev_config_sync(vdev_t **svd, int svdco * to disk to ensure that all odd-label updates are committed to * stable storage before the next transaction group begins. */ - return (vdev_label_sync_list(spa, 1, txg, flags)); + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) + goto retry;; + + trim_thread_wakeup(spa); + + return (0); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 vdev_mirror.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c 27 Feb 2010 22:31:06 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c 18 Apr 2017 00:26:18 -0000 @@ -19,10 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + */ + #include #include #include @@ -37,27 +41,105 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; + int mc_load; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; } mirror_child_t; typedef struct mirror_map { + int *mm_preferred; + int mm_preferred_cnt; int mm_children; - int mm_replacing; - int mm_preferred; - int mm_root; - mirror_child_t mm_child[1]; + boolean_t mm_replacing; + boolean_t mm_root; + mirror_child_t mm_child[]; } mirror_map_t; -int vdev_mirror_shift = 21; +static int vdev_mirror_shift = 21; + +#ifdef __FreeBSD__ +#ifdef _KERNEL +SYSCTL_DECL(_vfs_zfs_vdev); +static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, + "ZFS VDEV Mirror"); +#endif +#endif + +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. 
+ * + * If there is a mixture of rotating and non-rotating media, setting + * non_rotating_seek_inc to 0 may well provide better results as it + * will direct more reads to the non-rotating vdevs which are more + * likely to have a higher performance. + */ + +/* Rotating media load calculation configuration. */ +static int rotating_inc = 0; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RWTUN, + &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's"); +#endif + +static int rotating_seek_inc = 5; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RWTUN, + &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's"); +#endif + +static int rotating_seek_offset = 1 * 1024 * 1024; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RWTUN, + &rotating_seek_offset, 0, "Offset in bytes from the last I/O which " + "triggers a reduced rotating media seek increment"); +#endif + +/* Non-rotating media load calculation configuration. */ +static int non_rotating_inc = 0; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RWTUN, + &non_rotating_inc, 0, + "Non-rotating media load increment for non-seeking I/O's"); +#endif + +static int non_rotating_seek_inc = 1; +#ifdef _KERNEL +SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RWTUN, + &non_rotating_seek_inc, 0, + "Non-rotating media load increment for seeking I/O's"); +#endif + + +static inline size_t +vdev_mirror_map_size(int children) +{ + return (offsetof(mirror_map_t, mm_child[children]) + + sizeof(int) * children); +} + +static inline mirror_map_t * +vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root) +{ + mirror_map_t *mm; + + mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); + mm->mm_children = children; + mm->mm_replacing = replacing; + mm->mm_root = root; + mm->mm_preferred = (int *)((uintptr_t)mm + + offsetof(mirror_map_t, mm_child[children])); + + return mm; +} static void vdev_mirror_map_free(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); + kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); } static const zio_vsd_ops_t vdev_mirror_vsd_ops = { @@ -65,55 +147,80 @@ static const zio_vsd_ops_t vdev_mirror_v zio_vsd_default_cksum_report }; +static int +vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) +{ + uint64_t lastoffset; + int load; + + /* All DVAs have equal weight at the root. */ + if (mm->mm_root) + return (INT_MAX); + + /* + * We don't return INT_MAX if the device is resilvering i.e. + * vdev_resilver_txg != 0 as when tested performance was slightly + * worse overall when resilvering with compared to without. + */ + + /* Standard load based on pending queue length. */ + load = vdev_queue_length(vd); + lastoffset = vdev_queue_lastoffset(vd); + + if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) { + /* Non-rotating media. */ + if (lastoffset == zio_offset) + return (load + non_rotating_inc); + + /* + * Apply a seek penalty even for non-rotating devices as + * sequential I/O'a can be aggregated into fewer operations + * on the device, thus avoiding unnecessary per-command + * overhead and boosting performance. + */ + return (load + non_rotating_seek_inc); + } + + /* Rotating media I/O's which directly follow the last I/O. 
*/ + if (lastoffset == zio_offset) + return (load + rotating_inc); + + /* + * Apply half the seek increment to I/O's within seek offset + * of the last I/O queued to this vdev as they should incure less + * of a seek increment. + */ + if (ABS(lastoffset - zio_offset) < rotating_seek_offset) + return (load + (rotating_seek_inc / 2)); + + /* Apply the full seek increment to all other I/O's. */ + return (load + rotating_seek_inc); +} + + static mirror_map_t * -vdev_mirror_map_alloc(zio_t *zio) +vdev_mirror_map_init(zio_t *zio) { mirror_map_t *mm = NULL; mirror_child_t *mc; vdev_t *vd = zio->io_vd; - int c, d; + int c; if (vd == NULL) { dva_t *dva = zio->io_bp->blk_dva; spa_t *spa = zio->io_spa; - c = BP_GET_NDVAS(zio->io_bp); - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = B_FALSE; - mm->mm_preferred = spa_get_random(c); - mm->mm_root = B_TRUE; - - /* - * Check the other, lower-index DVAs to see if they're on - * the same vdev as the child we picked. If they are, use - * them since they are likely to have been allocated from - * the primary metaslab in use at the time, and hence are - * more likely to have locality with single-copy data. - */ - for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) { - if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c])) - mm->mm_preferred = d; - } - + mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE, + B_TRUE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; - mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { - c = vd->vdev_children; - - mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP); - mm->mm_children = c; - mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - mm->mm_preferred = mm->mm_replacing ? 0 : - (zio->io_offset >> vdev_mirror_shift) % c; - mm->mm_root = B_FALSE; - + mm = vdev_mirror_map_alloc(vd->vdev_children, + (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops), B_FALSE); for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; @@ -127,14 +234,15 @@ vdev_mirror_map_alloc(zio_t *zio) } static int -vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) { int numerrors = 0; int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } vdev_open_children(vd); @@ -149,7 +257,10 @@ vdev_mirror_open(vdev_t *vd, uint64_t *a } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { @@ -184,9 +295,10 @@ vdev_mirror_scrub_done(zio_t *zio) if (zio->io_error == 0) { zio_t *pio; + zio_link_t *zl = NULL; mutex_enter(&zio->io_lock); - while ((pio = zio_walk_parents(zio)) != NULL) { + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); bcopy(zio->io_data, pio->io_data, pio->io_size); @@ -203,50 +315,124 @@ vdev_mirror_scrub_done(zio_t *zio) } /* - * Try to find a child whose DTL doesn't contain the block we want to read. 
+ * Check the other, lower-index DVAs to see if they're on the same + * vdev as the child we picked. If they are, use them since they + * are likely to have been allocated from the primary metaslab in + * use at the time, and hence are more likely to have locality with + * single-copy data. + */ +static int +vdev_mirror_dva_select(zio_t *zio, int p) +{ + dva_t *dva = zio->io_bp->blk_dva; + mirror_map_t *mm = zio->io_vsd; + int preferred; + int c; + + preferred = mm->mm_preferred[p]; + for (p-- ; p >= 0; p--) { + c = mm->mm_preferred[p]; + if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) + preferred = c; + } + return (preferred); +} + +static int +vdev_mirror_preferred_child_randomize(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + int p; + + if (mm->mm_root) { + p = spa_get_random(mm->mm_preferred_cnt); + return (vdev_mirror_dva_select(zio, p)); + } + + /* + * To ensure we don't always favour the first matching vdev, + * which could lead to wear leveling issues on SSD's, we + * use the I/O offset as a pseudo random seed into the vdevs + * which have the lowest load. + */ + p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; + return (mm->mm_preferred[p]); +} + +/* + * Try to find a vdev whose DTL doesn't contain the block we want to read + * prefering vdevs based on determined load. + * * If we can't, try the read on any vdev we haven't already tried. */ static int vdev_mirror_child_select(zio_t *zio) { mirror_map_t *mm = zio->io_vsd; - mirror_child_t *mc; uint64_t txg = zio->io_txg; - int i, c; + int c, lowest_load; ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); - /* - * Try to find a child whose DTL doesn't contain the block to read. - * If a child is known to be completely inaccessible (indicated by - * vdev_readable() returning B_FALSE), don't even try. - */ - for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) { - if (c >= mm->mm_children) - c = 0; + lowest_load = INT_MAX; + mm->mm_preferred_cnt = 0; + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc; + mc = &mm->mm_child[c]; if (mc->mc_tried || mc->mc_skipped) continue; + if (!vdev_readable(mc->mc_vd)) { - mc->mc_error = ENXIO; + mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) - return (c); - mc->mc_error = ESTALE; - mc->mc_skipped = 1; - mc->mc_speculative = 1; + + if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + mc->mc_error = SET_ERROR(ESTALE); + mc->mc_skipped = 1; + mc->mc_speculative = 1; + continue; + } + + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); + if (mc->mc_load > lowest_load) + continue; + + if (mc->mc_load < lowest_load) { + lowest_load = mc->mc_load; + mm->mm_preferred_cnt = 0; + } + mm->mm_preferred[mm->mm_preferred_cnt] = c; + mm->mm_preferred_cnt++; + } + + if (mm->mm_preferred_cnt == 1) { + vdev_queue_register_lastoffset( + mm->mm_child[mm->mm_preferred[0]].mc_vd, zio); + return (mm->mm_preferred[0]); + } + + if (mm->mm_preferred_cnt > 1) { + int c = vdev_mirror_preferred_child_randomize(zio); + + vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio); + return (c); } /* * Every device is either missing or has this txg in its DTL. * Look for any child we haven't already tried before giving up. 
*/ - for (c = 0; c < mm->mm_children; c++) - if (!mm->mm_child[c].mc_tried) + for (c = 0; c < mm->mm_children; c++) { + if (!mm->mm_child[c].mc_tried) { + vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, + zio); return (c); + } + } /* * Every child failed. There's no place left to look. @@ -254,17 +440,18 @@ vdev_mirror_child_select(zio_t *zio) return (-1); } -static int +static void vdev_mirror_io_start(zio_t *zio) { mirror_map_t *mm; mirror_child_t *mc; int c, children; - mm = vdev_mirror_map_alloc(zio); + mm = vdev_mirror_map_init(zio); if (zio->io_type == ZIO_TYPE_READ) { - if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing && + mm->mm_children > 1) { /* * For scrubbing reads we need to allocate a read * buffer for each child and issue reads to all @@ -279,7 +466,8 @@ vdev_mirror_io_start(zio_t *zio) zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } - return (ZIO_PIPELINE_CONTINUE); + zio_execute(zio); + return; } /* * For normal reads just pick one child. @@ -287,10 +475,11 @@ vdev_mirror_io_start(zio_t *zio) c = vdev_mirror_child_select(zio); children = (c >= 0); } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE); /* - * Writes go to all children. + * Writes and frees go to all children. */ c = 0; children = mm->mm_children; @@ -305,7 +494,7 @@ vdev_mirror_io_start(zio_t *zio) c++; } - return (ZIO_PIPELINE_CONTINUE); + zio_execute(zio); } static int @@ -371,6 +560,8 @@ vdev_mirror_io_done(zio_t *zio) zio->io_error = vdev_mirror_worst_error(mm); } return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -420,13 +611,13 @@ vdev_mirror_io_done(zio_t *zio) !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; - mc->mc_error = ESTALE; + mc->mc_error = SET_ERROR(ESTALE); } zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, - ZIO_TYPE_WRITE, zio->io_priority, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } @@ -452,6 +643,8 @@ vdev_ops_t vdev_mirror_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -463,6 +656,8 @@ vdev_ops_t vdev_replacing_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -474,6 +669,8 @@ vdev_ops_t vdev_spare_ops = { vdev_mirror_io_start, vdev_mirror_io_done, vdev_mirror_state_change, + NULL, + NULL, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 vdev_missing.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c 27 Feb 2010 22:31:12 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c 19 Nov 2014 12:25:12 -0000 @@ -19,11 +19,15 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ /* + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +/* * The 'missing' vdev is a special vdev type used only during import. It * signifies a placeholder in the root vdev for some vdev that we know is * missing. We pass it down to the kernel to allow the rest of the @@ -40,7 +44,8 @@ /* ARGSUSED */ static int -vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) +vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) { /* * Really this should just fail. But then the root vdev will be in the @@ -49,7 +54,9 @@ vdev_missing_open(vdev_t *vd, uint64_t * * will fail the GUID sum check before ever trying to open the pool. */ *psize = 0; - *ashift = 0; + *max_psize = 0; + *logical_ashift = 0; + *physical_ashift = 0; return (0); } @@ -60,11 +67,11 @@ vdev_missing_close(vdev_t *vd) } /* ARGSUSED */ -static int +static void vdev_missing_io_start(zio_t *zio) { - zio->io_error = ENOTSUP; - return (ZIO_PIPELINE_CONTINUE); + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); } /* ARGSUSED */ @@ -80,6 +87,8 @@ vdev_ops_t vdev_missing_ops = { vdev_missing_io_start, vdev_missing_io_done, NULL, + NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -91,6 +100,8 @@ vdev_ops_t vdev_hole_ops = { vdev_missing_io_start, vdev_missing_io_done, NULL, + NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 vdev_queue.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c 27 Feb 2010 22:31:12 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c 27 Mar 2017 06:19:46 -0000 @@ -23,28 +23,148 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + #include #include +#include #include #include +#include +#include + +/* + * ZFS I/O Scheduler + * --------------- + * + * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The + * I/O scheduler determines when and in what order those operations are + * issued. The I/O scheduler divides operations into six I/O classes + * prioritized in the following order: sync read, sync write, async read, + * async write, scrub/resilver and trim. Each queue defines the minimum and + * maximum number of concurrent operations that may be issued to the device. + * In addition, the device has an aggregate maximum. Note that the sum of the + * per-queue minimums must not exceed the aggregate maximum, and if the + * aggregate maximum is equal to or greater than the sum of the per-queue + * maximums, the per-queue minimum has no effect. + * + * For many physical devices, throughput increases with the number of + * concurrent operations, but latency typically suffers. Further, physical + * devices typically have a limit at which more concurrent operations have no + * effect on throughput or can actually cause it to decrease. + * + * The scheduler selects the next operation to issue by first looking for an + * I/O class whose minimum has not been satisfied. Once all are satisfied and + * the aggregate maximum has not been hit, the scheduler looks for classes + * whose maximum has not been satisfied. 
Iteration through the I/O classes is + * done in the order specified above. No further operations are issued if the + * aggregate maximum number of concurrent operations has been hit or if there + * are no operations queued for an I/O class that has not hit its maximum. + * Every time an I/O is queued or an operation completes, the I/O scheduler + * looks for new operations to issue. + * + * All I/O classes have a fixed maximum number of outstanding operations + * except for the async write class. Asynchronous writes represent the data + * that is committed to stable storage during the syncing stage for + * transaction groups (see txg.c). Transaction groups enter the syncing state + * periodically so the number of queued async writes will quickly burst up and + * then bleed down to zero. Rather than servicing them as quickly as possible, + * the I/O scheduler changes the maximum number of active async write I/Os + * according to the amount of dirty data in the pool (see dsl_pool.c). Since + * both throughput and latency typically increase with the number of + * concurrent operations issued to physical devices, reducing the burstiness + * in the number of concurrent operations also stabilizes the response time of + * operations from other -- and in particular synchronous -- queues. In broad + * strokes, the I/O scheduler will issue more concurrent operations from the + * async write queue as there's more dirty data in the pool. + * + * Async Writes + * + * The number of concurrent operations issued for the async write I/O class + * follows a piece-wise linear function defined by a few adjustable points. + * + * | o---------| <-- zfs_vdev_async_write_max_active + * ^ | /^ | + * | | / | | + * active | / | | + * I/O | / | | + * count | / | | + * | / | | + * |------------o | | <-- zfs_vdev_async_write_min_active + * 0|____________^______|_________| + * 0% | | 100% of zfs_dirty_data_max + * | | + * | `-- zfs_vdev_async_write_active_max_dirty_percent + * `--------- zfs_vdev_async_write_active_min_dirty_percent + * + * Until the amount of dirty data exceeds a minimum percentage of the dirty + * data allowed in the pool, the I/O scheduler will limit the number of + * concurrent operations to the minimum. As that threshold is crossed, the + * number of concurrent operations issued increases linearly to the maximum at + * the specified maximum percentage of the dirty data allowed in the pool. + * + * Ideally, the amount of dirty data on a busy pool will stay in the sloped + * part of the function between zfs_vdev_async_write_active_min_dirty_percent + * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the + * maximum percentage, this indicates that the rate of incoming data is + * greater than the rate that the backend storage can handle. In this case, we + * must further throttle incoming writes (see dmu_tx_delay() for details). + */ /* - * These tunables are for performance analysis. + * The maximum number of I/Os active to each device. Ideally, this will be >= + * the sum of each queue's max_active. It must be at least the sum of each + * queue's min_active. */ +uint32_t zfs_vdev_max_active = 1000; + +/* + * Per-queue limits on the number of I/Os active to each device. If the + * sum of the queue's max_active is < zfs_vdev_max_active, then the + * min_active comes into play. We will send min_active from each queue, + * and then select from queues in the order defined by zio_priority_t. 
+ * + * In general, smaller max_active's will lead to lower latency of synchronous + * operations. Larger max_active's may lead to higher overall throughput, + * depending on underlying storage. + * + * The ratio of the queues' max_actives determines the balance of performance + * between reads, writes, and scrubs. E.g., increasing + * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete + * more quickly, but reads and writes to have higher latency and lower + * throughput. + */ +uint32_t zfs_vdev_sync_read_min_active = 10; +uint32_t zfs_vdev_sync_read_max_active = 10; +uint32_t zfs_vdev_sync_write_min_active = 10; +uint32_t zfs_vdev_sync_write_max_active = 10; +uint32_t zfs_vdev_async_read_min_active = 1; +uint32_t zfs_vdev_async_read_max_active = 3; +uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_max_active = 10; +uint32_t zfs_vdev_scrub_min_active = 1; +uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_trim_min_active = 1; /* - * zfs_vdev_max_pending is the maximum number of i/os concurrently - * pending to each device. zfs_vdev_min_pending is the initial number - * of i/os pending to each device (before it starts ramping up to - * max_pending). + * TRIM max active is large in comparison to the other values due to the fact + * that TRIM IOs are coalesced at the device layer. This value is set such + * that a typical SSD can process the queued IOs in a single request. */ -int zfs_vdev_max_pending = 10; -int zfs_vdev_min_pending = 4; +uint32_t zfs_vdev_trim_max_active = 64; -/* deadline = pri + ddi_get_lbolt64() >> time_shift) */ -int zfs_vdev_time_shift = 6; -/* exponential I/O issue ramp-up rate */ -int zfs_vdev_ramp_rate = 2; +/* + * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent + * dirty data, use zfs_vdev_async_write_min_active. When it has more than + * zfs_vdev_async_write_active_max_dirty_percent, use + * zfs_vdev_async_write_max_active. The value is linearly interpolated + * between min and max. + */ +int zfs_vdev_async_write_active_min_dirty_percent = 30; +int zfs_vdev_async_write_active_max_dirty_percent = 60; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. @@ -52,24 +172,135 @@ int zfs_vdev_ramp_rate = 2; * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ -int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; /* - * Virtual device vector for disk I/O scheduling. + * Define the queue depth percentage for each top-level. This percentage is + * used in conjunction with zfs_vdev_async_max_active to determine how many + * allocations a specific top-level vdev should handle. Once the queue depth + * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 + * then allocator will stop allocating blocks on that top-level device. + * The default kernel setting is 1000% which will yield 100 allocations per + * device. For userland testing, the default setting is 300% which equates + * to 30 allocations per device. 
*/ +#ifdef _KERNEL +int zfs_vdev_queue_depth_pct = 1000; +#else +int zfs_vdev_queue_depth_pct = 300; +#endif + + +#ifdef __FreeBSD__ +#ifdef _KERNEL +SYSCTL_DECL(_vfs_zfs_vdev); + +static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), + sysctl_zfs_async_write_active_min_dirty_percent, "I", + "Percentage of async write dirty data below which " + "async_write_min_active is used."); + +static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), + sysctl_zfs_async_write_active_max_dirty_percent, "I", + "Percentage of async write dirty data above which " + "async_write_max_active is used."); + +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN, + &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device."); + +#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ + &zfs_vdev_ ## name ## _min_active, 0, \ + "Initial number of I/O requests of type " #name \ + " active for each device"); + +#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN,\ + &zfs_vdev_ ## name ## _max_active, 0, \ + "Maximum number of I/O requests of type " #name \ + " active for each device"); + +ZFS_VDEV_QUEUE_KNOB_MIN(sync_read); +ZFS_VDEV_QUEUE_KNOB_MAX(sync_read); +ZFS_VDEV_QUEUE_KNOB_MIN(sync_write); +ZFS_VDEV_QUEUE_KNOB_MAX(sync_write); +ZFS_VDEV_QUEUE_KNOB_MIN(async_read); +ZFS_VDEV_QUEUE_KNOB_MAX(async_read); +ZFS_VDEV_QUEUE_KNOB_MIN(async_write); +ZFS_VDEV_QUEUE_KNOB_MAX(async_write); +ZFS_VDEV_QUEUE_KNOB_MIN(scrub); +ZFS_VDEV_QUEUE_KNOB_MAX(scrub); +ZFS_VDEV_QUEUE_KNOB_MIN(trim); +ZFS_VDEV_QUEUE_KNOB_MAX(trim); + +#undef ZFS_VDEV_QUEUE_KNOB + +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN, + &zfs_vdev_aggregation_limit, 0, + "I/O requests are aggregated up to this size"); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN, + &zfs_vdev_read_gap_limit, 0, + "Acceptable gap between two reads being aggregated"); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN, + &zfs_vdev_write_gap_limit, 0, + "Acceptable gap between two writes being aggregated"); +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN, + &zfs_vdev_queue_depth_pct, 0, + "Queue depth percentage for each top-level"); + +static int +sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_vdev_async_write_active_min_dirty_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val > 100 || + val >= zfs_vdev_async_write_active_max_dirty_percent) + return (EINVAL); + + zfs_vdev_async_write_active_min_dirty_percent = val; + + return (0); +} + +static int +sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS) +{ + int val, err; + + val = zfs_vdev_async_write_active_max_dirty_percent; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 0 || val > 100 || + val <= zfs_vdev_async_write_active_min_dirty_percent) + return (EINVAL); + + zfs_vdev_async_write_active_max_dirty_percent = val; + + return (0); +} +#endif 
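To make the cutoff described in the zfs_vdev_queue_depth_pct comment above concrete, the per-vdev allocation limit works out as below; this helper is an illustrative sketch, not part of the patch.

/*
 * Allocation cutoff per top-level vdev.  With the kernel defaults
 * (1000% and 10 async writes) this is 1000 * 10 / 100 = 100 allocations;
 * with the userland default of 300% it is 30.
 */
static int
queue_depth_limit(int queue_depth_pct, int async_write_max_active)
{
	return (queue_depth_pct * async_write_max_active / 100);
}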
+#endif + int -vdev_queue_deadline_compare(const void *x1, const void *x2) +vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; - if (z1->io_deadline < z2->io_deadline) - return (-1); - if (z1->io_deadline > z2->io_deadline) - return (1); - if (z1->io_offset < z2->io_offset) return (-1); if (z1->io_offset > z2->io_offset) @@ -83,12 +314,34 @@ vdev_queue_deadline_compare(const void * return (0); } +static inline avl_tree_t * +vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) +{ + return (&vq->vq_class[p].vqc_queued_tree); +} + +static inline avl_tree_t * +vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) +{ + if (t == ZIO_TYPE_READ) + return (&vq->vq_read_offset_tree); + else if (t == ZIO_TYPE_WRITE) + return (&vq->vq_write_offset_tree); + else + return (NULL); +} + int -vdev_queue_offset_compare(const void *x1, const void *x2) +vdev_queue_timestamp_compare(const void *x1, const void *x2) { const zio_t *z1 = x1; const zio_t *z2 = x2; + if (z1->io_timestamp < z2->io_timestamp) + return (-1); + if (z1->io_timestamp > z2->io_timestamp) + return (1); + if (z1->io_offset < z2->io_offset) return (-1); if (z1->io_offset > z2->io_offset) @@ -108,18 +361,35 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); + vq->vq_vdev = vd; + + avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); - avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare, - sizeof (zio_t), offsetof(struct zio, io_deadline_node)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + int (*compfn) (const void *, const void *); - avl_create(&vq->vq_read_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_offset_node)); + /* + * The synchronous i/o queues are dispatched in FIFO rather + * than LBA order. This provides more consistent latency for + * these i/os. 
+ */ + if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) + compfn = vdev_queue_timestamp_compare; + else + compfn = vdev_queue_offset_compare; - avl_create(&vq->vq_write_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_class_tree(vq, p), compfn, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); + } - avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_lastoffset = 0; } void @@ -127,10 +397,11 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - avl_destroy(&vq->vq_deadline_tree); - avl_destroy(&vq->vq_read_tree); - avl_destroy(&vq->vq_write_tree); - avl_destroy(&vq->vq_pending_tree); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) + avl_destroy(vdev_queue_class_tree(vq, p)); + avl_destroy(&vq->vq_active_tree); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); mutex_destroy(&vq->vq_lock); } @@ -138,30 +409,231 @@ vdev_queue_fini(vdev_t *vd) static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - avl_add(&vq->vq_deadline_tree, zio); - avl_add(zio->io_vdev_tree, zio); + spa_t *spa = zio->io_spa; + avl_tree_t *qtt; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + qtt = vdev_queue_type_tree(vq, zio->io_type); + if (qtt) + avl_add(qtt, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + spa->spa_queue_stats[zio->io_priority].spa_queued++; + if (spa->spa_iokstat != NULL) + kstat_waitq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); +#endif } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - avl_remove(&vq->vq_deadline_tree, zio); - avl_remove(zio->io_vdev_tree, zio); + spa_t *spa = zio->io_spa; + avl_tree_t *qtt; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + qtt = vdev_queue_type_tree(vq, zio->io_type); + if (qtt) + avl_remove(qtt, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0); + spa->spa_queue_stats[zio->io_priority].spa_queued--; + if (spa->spa_iokstat != NULL) + kstat_waitq_exit(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); +#endif } static void -vdev_queue_agg_io_done(zio_t *aio) +vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { - zio_t *pio; + spa_t *spa = zio->io_spa; + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + vq->vq_class[zio->io_priority].vqc_active++; + avl_add(&vq->vq_active_tree, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + spa->spa_queue_stats[zio->io_priority].spa_active++; + if (spa->spa_iokstat != NULL) + kstat_runq_enter(spa->spa_iokstat->ks_data); + mutex_exit(&spa->spa_iokstat_lock); +#endif +} - while ((pio = zio_walk_parents(aio)) != NULL) - if (aio->io_type == ZIO_TYPE_READ) +static void +vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + vq->vq_class[zio->io_priority].vqc_active--; + avl_remove(&vq->vq_active_tree, zio); + +#ifdef illumos + mutex_enter(&spa->spa_iokstat_lock); + 
ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0); + spa->spa_queue_stats[zio->io_priority].spa_active--; + if (spa->spa_iokstat != NULL) { + kstat_io_t *ksio = spa->spa_iokstat->ks_data; + + kstat_runq_exit(spa->spa_iokstat->ks_data); + if (zio->io_type == ZIO_TYPE_READ) { + ksio->reads++; + ksio->nread += zio->io_size; + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ksio->writes++; + ksio->nwritten += zio->io_size; + } + } + mutex_exit(&spa->spa_iokstat_lock); +#endif +} + +static void +vdev_queue_agg_io_done(zio_t *aio) +{ + if (aio->io_type == ZIO_TYPE_READ) { + zio_t *pio; + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(aio, &zl)) != NULL) { bcopy((char *)aio->io_data + (pio->io_offset - aio->io_offset), pio->io_data, pio->io_size); + } + } zio_buf_free(aio->io_data, aio->io_size); } +static int +vdev_queue_class_min_active(zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SYNC_READ: + return (zfs_vdev_sync_read_min_active); + case ZIO_PRIORITY_SYNC_WRITE: + return (zfs_vdev_sync_write_min_active); + case ZIO_PRIORITY_ASYNC_READ: + return (zfs_vdev_async_read_min_active); + case ZIO_PRIORITY_ASYNC_WRITE: + return (zfs_vdev_async_write_min_active); + case ZIO_PRIORITY_SCRUB: + return (zfs_vdev_scrub_min_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_min_active); + default: + panic("invalid priority %u", p); + return (0); + } +} + +static __noinline int +vdev_queue_max_async_writes(spa_t *spa) +{ + int writes; + uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; + uint64_t min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + uint64_t max_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_max_dirty_percent / 100; + + /* + * Sync tasks correspond to interactive user actions. To reduce the + * execution time of those actions we push data out as fast as possible. + */ + if (spa_has_pending_synctask(spa)) { + return (zfs_vdev_async_write_max_active); + } + + if (dirty < min_bytes) + return (zfs_vdev_async_write_min_active); + if (dirty > max_bytes) + return (zfs_vdev_async_write_max_active); + + /* + * linear interpolation: + * slope = (max_writes - min_writes) / (max_bytes - min_bytes) + * move right by min_bytes + * move up by min_writes + */ + writes = (dirty - min_bytes) * + (zfs_vdev_async_write_max_active - + zfs_vdev_async_write_min_active) / + (max_bytes - min_bytes) + + zfs_vdev_async_write_min_active; + ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); + ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); + return (writes); +} + +static int +vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SYNC_READ: + return (zfs_vdev_sync_read_max_active); + case ZIO_PRIORITY_SYNC_WRITE: + return (zfs_vdev_sync_write_max_active); + case ZIO_PRIORITY_ASYNC_READ: + return (zfs_vdev_async_read_max_active); + case ZIO_PRIORITY_ASYNC_WRITE: + return (vdev_queue_max_async_writes(spa)); + case ZIO_PRIORITY_SCRUB: + return (zfs_vdev_scrub_max_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_max_active); + default: + panic("invalid priority %u", p); + return (0); + } +} + +/* + * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if + * there is no eligible class. 
+ */ +static zio_priority_t +vdev_queue_class_to_issue(vdev_queue_t *vq) +{ + spa_t *spa = vq->vq_vdev->vdev_spa; + zio_priority_t p; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + return (ZIO_PRIORITY_NUM_QUEUEABLE); + + /* find a queue that has not reached its minimum # outstanding i/os */ + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_min_active(p)) + return (p); + } + + /* + * If we haven't found a queue, look for one that hasn't reached its + * maximum # outstanding i/os. + */ + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_max_active(spa, p)) + return (p); + } + + /* No eligible queued i/os */ + return (ZIO_PRIORITY_NUM_QUEUEABLE); +} + /* * Compute the range spanned by two i/os, which is the endpoint of the last * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). @@ -172,153 +644,192 @@ vdev_queue_agg_io_done(zio_t *aio) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) +vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { - zio_t *fio, *lio, *aio, *dio, *nio, *mio; + zio_t *first, *last, *aio, *dio, *mandatory, *nio; + void *abuf; + uint64_t maxgap = 0; + uint64_t size; + boolean_t stretch; avl_tree_t *t; - int flags; - uint64_t maxspan = zfs_vdev_aggregation_limit; - uint64_t maxgap; - int stretch; + enum zio_flag flags; -again: ASSERT(MUTEX_HELD(&vq->vq_lock)); - if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || - avl_numnodes(&vq->vq_deadline_tree) == 0) + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) return (NULL); - fio = lio = avl_first(&vq->vq_deadline_tree); + first = last = zio; - t = fio->io_vdev_tree; - flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; - maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; + if (zio->io_type == ZIO_TYPE_READ) + maxgap = zfs_vdev_read_gap_limit; - if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { - /* - * We can aggregate I/Os that are sufficiently adjacent and of - * the same flavor, as expressed by the AGG_INHERIT flags. - * The latter requirement is necessary so that certain - * attributes of the I/O, such as whether it's a normal I/O - * or a scrub/resilver, can be preserved in the aggregate. - * We can include optional I/Os, but don't allow them - * to begin a range as they add no benefit in that situation. - */ + /* + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ - /* - * We keep track of the last non-optional I/O. - */ - mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; + /* + * We keep track of the last non-optional I/O. + */ + mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; - /* - * Walk backwards through sufficiently contiguous I/Os - * recording the last non-option I/O. 
- */ - while ((dio = AVL_PREV(t, fio)) != NULL && - (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(dio, lio) <= maxspan && - IO_GAP(dio, fio) <= maxgap) { - fio = dio; - if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) - mio = fio; - } + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-option I/O. + */ + flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + t = vdev_queue_type_tree(vq, zio->io_type); + while (t != NULL && (dio = AVL_PREV(t, first)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && + IO_GAP(dio, first) <= maxgap) { + first = dio; + if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) + mandatory = first; + } - /* - * Skip any initial optional I/Os. - */ - while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { - fio = AVL_NEXT(t, fio); - ASSERT(fio != NULL); - } + /* + * Skip any initial optional I/Os. + */ + while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { + first = AVL_NEXT(t, first); + ASSERT(first != NULL); + } - /* - * Walk forward through sufficiently contiguous I/Os. - */ - while ((dio = AVL_NEXT(t, lio)) != NULL && - (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && - IO_SPAN(fio, dio) <= maxspan && - IO_GAP(lio, dio) <= maxgap) { - lio = dio; - if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) - mio = lio; - } + /* + * Walk forward through sufficiently contiguous I/Os. + */ + while ((dio = AVL_NEXT(t, last)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit && + IO_GAP(last, dio) <= maxgap) { + last = dio; + if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) + mandatory = last; + } - /* - * Now that we've established the range of the I/O aggregation - * we must decide what to do with trailing optional I/Os. - * For reads, there's nothing to do. While we are unable to - * aggregate further, it's possible that a trailing optional - * I/O would allow the underlying device to aggregate with - * subsequent I/Os. We must therefore determine if the next - * non-optional I/O is close enough to make aggregation - * worthwhile. - */ - stretch = B_FALSE; - if (t != &vq->vq_read_tree && mio != NULL) { - nio = lio; - while ((dio = AVL_NEXT(t, nio)) != NULL && - IO_GAP(nio, dio) == 0 && - IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { - nio = dio; - if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { - stretch = B_TRUE; - break; - } + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { + zio_t *nio = last; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; } } + } - if (stretch) { - /* This may be a no-op. 
*/ - VERIFY((dio = AVL_NEXT(t, lio)) != NULL); - dio->io_flags &= ~ZIO_FLAG_OPTIONAL; - } else { - while (lio != mio && lio != fio) { - ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); - lio = AVL_PREV(t, lio); - ASSERT(lio != NULL); - } + if (stretch) { + /* This may be a no-op. */ + dio = AVL_NEXT(t, last); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + while (last != mandatory && last != first) { + ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); + last = AVL_PREV(t, last); + ASSERT(last != NULL); } } - if (fio != lio) { - uint64_t size = IO_SPAN(fio, lio); - ASSERT(size <= zfs_vdev_aggregation_limit); - - aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); - - nio = fio; - do { - dio = nio; - nio = AVL_NEXT(t, dio); - ASSERT(dio->io_type == aio->io_type); - ASSERT(dio->io_vdev_tree == t); - - if (dio->io_flags & ZIO_FLAG_NODATA) { - ASSERT(dio->io_type == ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); - } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); - } + if (first == last) + return (NULL); + + size = IO_SPAN(first, last); + ASSERT3U(size, <=, zfs_vdev_aggregation_limit); + + abuf = zio_buf_alloc_nowait(size); + if (abuf == NULL) + return (NULL); + + aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, + abuf, size, first->io_type, zio->io_priority, + flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + vdev_queue_agg_io_done, NULL); + aio->io_timestamp = first->io_timestamp; + + nio = first; + do { + dio = nio; + nio = AVL_NEXT(t, dio); + ASSERT3U(dio->io_type, ==, aio->io_type); + + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); + bzero((char *)aio->io_data + (dio->io_offset - + aio->io_offset), dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { + bcopy(dio->io_data, (char *)aio->io_data + + (dio->io_offset - aio->io_offset), + dio->io_size); + } + + zio_add_child(dio, aio); + vdev_queue_io_remove(vq, dio); + zio_vdev_io_bypass(dio); + zio_execute(dio); + } while (dio != last); + + return (aio); +} + +static zio_t * +vdev_queue_io_to_issue(vdev_queue_t *vq) +{ + zio_t *zio, *aio; + zio_priority_t p; + avl_index_t idx; + avl_tree_t *tree; + zio_t search; - zio_add_child(dio, aio); - vdev_queue_io_remove(vq, dio); - zio_vdev_io_bypass(dio); - zio_execute(dio); - } while (dio != lio); +again: + ASSERT(MUTEX_HELD(&vq->vq_lock)); - avl_add(&vq->vq_pending_tree, aio); + p = vdev_queue_class_to_issue(vq); - return (aio); + if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { + /* No eligible queued i/os */ + return (NULL); } - ASSERT(fio->io_vdev_tree == t); - vdev_queue_io_remove(vq, fio); + /* + * For LBA-ordered queues (async / scrub), issue the i/o which follows + * the most recently issued i/o in LBA (offset) order. + * + * For FIFO queues (sync), issue the i/o with the lowest timestamp. 
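To make the span/gap arithmetic used by the aggregation path concrete, here is a worked sketch (not part of the patch); EX_IO_SPAN and EX_IO_GAP restate the IO_SPAN/IO_GAP formulas described earlier in this file, and the offsets and sizes are hypothetical.

#include <stdint.h>

struct ex_io { uint64_t io_offset, io_size; };

#define EX_IO_SPAN(f, l)  (((l).io_offset + (l).io_size) - (f).io_offset)
#define EX_IO_GAP(f, l)   ((l).io_offset - ((f).io_offset + (f).io_size))

/* first read covers [0x10000, 0x12000), second covers [0x13000, 0x14000) */
static const struct ex_io ex_first  = { 0x10000, 0x2000 };
static const struct ex_io ex_second = { 0x13000, 0x1000 };

/*
 * EX_IO_SPAN(ex_first, ex_second) == 0x4000 and
 * EX_IO_GAP(ex_first, ex_second) == 0x1000, which is under the default
 * 32 KB read gap limit, so the two reads could be aggregated into a
 * single 0x4000-byte child I/O.
 */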
+ */ + tree = vdev_queue_class_tree(vq, p); + search.io_timestamp = 0; + search.io_offset = vq->vq_last_offset + 1; + VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL) + zio = avl_first(tree); + ASSERT3U(zio->io_priority, ==, p); + + aio = vdev_queue_aggregate(vq, zio); + if (aio != NULL) + zio = aio; + else + vdev_queue_io_remove(vq, zio); /* * If the I/O is or was optional and therefore has no data, we need to @@ -326,17 +837,18 @@ again: * deadlock that we could encounter since this I/O will complete * immediately. */ - if (fio->io_flags & ZIO_FLAG_NODATA) { + if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); - zio_vdev_io_bypass(fio); - zio_execute(fio); + zio_vdev_io_bypass(zio); + zio_execute(zio); mutex_enter(&vq->vq_lock); goto again; } - avl_add(&vq->vq_pending_tree, fio); + vdev_queue_pending_add(vq, zio); + vq->vq_last_offset = zio->io_offset; - return (fio); + return (zio); } zio_t * @@ -345,27 +857,33 @@ vdev_queue_io(zio_t *zio) vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *nio; - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + /* + * Children i/os inherent their parent's priority, which might + * not match the child's i/o type. Fix it up here. + */ + if (zio->io_type == ZIO_TYPE_READ) { + if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && + zio->io_priority != ZIO_PRIORITY_ASYNC_READ && + zio->io_priority != ZIO_PRIORITY_SCRUB) + zio->io_priority = ZIO_PRIORITY_ASYNC_READ; + } else if (zio->io_type == ZIO_TYPE_WRITE) { + if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && + zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) + zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; + } else { + ASSERT(zio->io_type == ZIO_TYPE_FREE); + zio->io_priority = ZIO_PRIORITY_TRIM; + } - if (zio->io_type == ZIO_TYPE_READ) - zio->io_vdev_tree = &vq->vq_read_tree; - else - zio->io_vdev_tree = &vq->vq_write_tree; + zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; mutex_enter(&vq->vq_lock); - - zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + - zio->io_priority; - + zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); - - nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending); - + nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); if (nio == NULL) @@ -383,15 +901,15 @@ void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; mutex_enter(&vq->vq_lock); - avl_remove(&vq->vq_pending_tree, zio); + vdev_queue_pending_remove(vq, zio); - for (int i = 0; i < zfs_vdev_ramp_rate; i++) { - zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending); - if (nio == NULL) - break; + vq->vq_io_complete_ts = gethrtime(); + + while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); @@ -404,3 +922,26 @@ vdev_queue_io_done(zio_t *zio) mutex_exit(&vq->vq_lock); } + +/* + * As these three methods are only used for load calculations we're not concerned + * if we get an incorrect value on 32bit platforms due to lack of vq_lock mutex + * use here, instead we prefer to keep it lock free for performance. 
+ */ +int +vdev_queue_length(vdev_t *vd) +{ + return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); +} + +uint64_t +vdev_queue_lastoffset(vdev_t *vd) +{ + return (vd->vdev_queue.vq_lastoffset); +} + +void +vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio) +{ + vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size; +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 vdev_raidz.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c 27 Feb 2010 22:31:14 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c 5 May 2017 18:03:34 -0000 @@ -20,17 +20,27 @@ */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include #include #include +#ifndef __FreeBSD__ +#include +#endif +#include +#include #include #include #include #include +#ifdef __FreeBSD__ +#include +#endif /* * Virtual device vector for RAID-Z. @@ -60,6 +70,7 @@ * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: + * * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 @@ -118,7 +129,7 @@ typedef struct raidz_map { uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ + uint64_t rm_skipstart; /* Column index of padding start */ void *rm_datacopy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ @@ -153,15 +164,14 @@ typedef struct raidz_map { VDEV_RAIDZ_64MUL_2((x), mask); \ } +#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE) + /* * Force reconstruction to use the general purpose method. */ int vdev_raidz_default_to_general; -/* - * These two tables represent powers and logs of 2 in the Galois field defined - * above. These values were computed by repeatedly multiplying by 2 as above. - */ +/* Powers of 2 in the Galois field defined above. */ static const uint8_t vdev_raidz_pow2[256] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, @@ -196,6 +206,7 @@ static const uint8_t vdev_raidz_pow2[256 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 }; +/* Logs of 2 in the Galois field defined above. 
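A minimal illustration (not part of the patch) of the multiply-by-two rule described in the Galois field comment above; the reduction constant 0x1d matches the vdev_raidz_pow2 table just shown, where 0x80 * 2 wraps to 0x1d.

/*
 * Multiply an element of the RAID-Z field GF(2^8) by 2, following the
 * bitwise definition given above.
 */
static unsigned char
gf256_mul2(unsigned char a)
{
	return ((unsigned char)((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00)));
}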
*/ static const uint8_t vdev_raidz_log2[256] = { 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, @@ -259,7 +270,9 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + if (rm->rm_col[c].rc_data != NULL) + zio_buf_free(rm->rm_col[c].rc_data, + rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -281,7 +294,7 @@ vdev_raidz_map_free_vsd(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; - ASSERT3U(rm->rm_freed, ==, 0); + ASSERT0(rm->rm_freed); rm->rm_freed = 1; if (rm->rm_reports == 0) @@ -431,23 +444,50 @@ static const zio_vsd_ops_t vdev_raidz_vs vdev_raidz_cksum_report }; +/* + * Divides the IO evenly across all child vdevs; usually, dcols is + * the number of children in the target vdev. + */ static raidz_map_t * -vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, - uint64_t nparity) +vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree, + uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; - uint64_t b = zio->io_offset >> unit_shift; - uint64_t s = zio->io_size >> unit_shift; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> unit_shift; + /* The first column for this stripe. */ uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ q = s / (dcols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ r = s - q * (dcols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ if (q == 0) { + /* Our I/O request doesn't span all child vdevs. 
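A small sketch (not part of the patch) of the quotient/remainder column arithmetic documented in vdev_raidz_map_alloc above, with hypothetical numbers: ashift 9 (512-byte sectors), 5 children, single parity, and a 32 KB block. The 40 KB result matches the dump-device example given later in this file.

#include <stdint.h>

static void
raidz_map_example(void)
{
	uint64_t unit_shift = 9, dcols = 5, nparity = 1;
	uint64_t s = (32 * 1024) >> unit_shift;			/* 64 sectors */
	uint64_t q = s / (dcols - nparity);			/* 16 sectors per column */
	uint64_t r = s - q * (dcols - nparity);			/* 0: no partial stripe */
	uint64_t bc = (r == 0 ? 0 : r + nparity);		/* 0 "big columns" */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 80 sectors */

	/*
	 * 80 sectors is 40 KB on disk: 8 KB of data on each of four
	 * children plus one 8 KB parity column.
	 */
	(void)bc; (void)tot;
}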
*/ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { @@ -504,14 +544,20 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_ ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); ASSERT3U(rm->rm_nskip, <=, nparity); - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); - - rm->rm_col[c].rc_data = zio->io_data; + if (!dofree) { + for (c = 0; c < rm->rm_firstdatacol; c++) { + rm->rm_col[c].rc_data = + zio_buf_alloc(rm->rm_col[c].rc_size); + } - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + rm->rm_col[c].rc_data = data; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_data = + (char *)rm->rm_col[c - 1].rc_data + + rm->rm_col[c - 1].rc_size; + } + } /* * If all data stored spans all columns, there's a danger that parity @@ -536,7 +582,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { devidx = rm->rm_col[0].rc_devidx; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; @@ -548,8 +594,6 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_ rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -959,12 +1003,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *r * ~~ ~~ * __ __ * | 1 1 1 1 1 1 1 1 | - * | 128 64 32 16 8 4 2 1 | * | 19 205 116 29 64 16 4 1 | * | 1 0 0 0 0 0 0 0 | - * | 0 1 0 0 0 0 0 0 | - * (V|I)' = | 0 0 1 0 0 0 0 0 | - * | 0 0 0 1 0 0 0 0 | + * (V|I)' = | 0 0 0 1 0 0 0 0 | * | 0 0 0 0 1 0 0 0 | * | 0 0 0 0 0 1 0 0 | * | 0 0 0 0 0 0 1 0 | @@ -1134,7 +1175,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm */ for (i = 0; i < nmissing; i++) { for (j = 0; j < missing[i]; j++) { - ASSERT3U(rows[i][j], ==, 0); + ASSERT0(rows[i][j]); } ASSERT3U(rows[i][missing[i]], !=, 0); @@ -1175,7 +1216,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm if (j == missing[i]) { ASSERT3U(rows[i][j], ==, 1); } else { - ASSERT3U(rows[i][j], ==, 0); + ASSERT0(rows[i][j]); } } } @@ -1190,7 +1231,8 @@ vdev_raidz_matrix_reconstruct(raidz_map_ uint64_t ccount; uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; - uint8_t log, val; + uint8_t log = 0; + uint8_t val; int ll; uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; @@ -1442,7 +1484,8 @@ vdev_raidz_reconstruct(raidz_map_t *rm, } static int -vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; @@ -1455,7 +1498,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *as if (nparity > VDEV_RAIDZ_MAXPARITY || vd->vdev_children < nparity + 1) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } vdev_open_children(vd); @@ -1470,10 +1513,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *as } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; if (numerrors > nparity) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; @@ 
-1492,6 +1539,154 @@ vdev_raidz_close(vdev_t *vd) vdev_close(vd->vdev_child[c]); } +#ifdef illumos +/* + * Handle a read or write I/O to a RAID-Z dump device. + * + * The dump device is in a unique situation compared to other ZFS datasets: + * writing to this device should be as simple and fast as possible. In + * addition, durability matters much less since the dump will be extracted + * once the machine reboots. For that reason, this function eschews parity for + * performance and simplicity. The dump device uses the checksum setting + * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this + * dataset. + * + * Blocks of size 128 KB have been preallocated for this volume. I/Os less than + * 128 KB will not fill an entire block; in addition, they may not be properly + * aligned. In that case, this function uses the preallocated 128 KB block and + * omits reading or writing any "empty" portions of that block, as opposed to + * allocating a fresh appropriately-sized block. + * + * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs: + * + * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB) + * + * If this were a standard RAID-Z dataset, a block of at least 40 KB would be + * allocated which spans all five child vdevs. 8 KB of data would be written to + * each of four vdevs, with the fifth containing the parity bits. + * + * parity data data data data + * | PP | XX | XX | XX | XX | + * ^ ^ ^ ^ ^ + * | | | | | + * 8 KB parity ------8 KB data blocks------ + * + * However, when writing to the dump device, the behavior is different: + * + * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB) + * + * Unlike the normal RAID-Z case in which the block is allocated based on the + * I/O size, reads and writes here always use a 128 KB logical I/O size. If the + * I/O size is less than 128 KB, only the actual portions of data are written. + * In this example the data is written to the third data vdev since that vdev + * contains the offset [64 KB, 96 KB). + * + * parity data data data data + * | | | | XX | | + * ^ + * | + * 32 KB data block + * + * As a result, an individual I/O may not span all child vdevs; moreover, a + * small I/O may only operate on a single child vdev. + * + * Note that since there are no parity bits calculated or written, this format + * remains the same no matter how many parity bits are used in a normal RAID-Z + * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above + * would look like: + * + * parity parity parity data data data data + * | | | | | | XX | | + * ^ + * | + * 32 KB data block + */ +int +vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size, + uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump) +{ + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, err = 0; + + uint64_t start, end, colstart, colend; + uint64_t coloffset, colsize, colskip; + + int flags = doread ? BIO_READ : BIO_WRITE; + +#ifdef _KERNEL + + /* + * Don't write past the end of the block + */ + VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE); + + start = offset; + end = start + size; + + /* + * Allocate a RAID-Z map for this block. Note that this block starts + * from the "original" offset, this is, the offset of the extent which + * contains the requisite offset of the data being read or written. + * + * Even if this I/O operation doesn't span the full block size, let's + * treat the on-disk format as if the only blocks are the complete 128 + * KB size. 
+ */ + rm = vdev_raidz_map_alloc(data - (offset - origoffset), + SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, + vd->vdev_children, vd->vdev_nparity); + + coloffset = origoffset; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; + c++, coloffset += rc->rc_size) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Find the start and end of this column in the RAID-Z map, + * keeping in mind that the stated size and offset of the + * operation may not fill the entire column for this vdev. + * + * If any portion of the data spans this column, issue the + * appropriate operation to the vdev. + */ + if (coloffset + rc->rc_size <= start) + continue; + if (coloffset >= end) + continue; + + colstart = MAX(coloffset, start); + colend = MIN(end, coloffset + rc->rc_size); + colsize = colend - colstart; + colskip = colstart - coloffset; + + VERIFY3U(colsize, <=, rc->rc_size); + VERIFY3U(colskip, <=, rc->rc_size); + + /* + * Note that the child vdev will have a vdev label at the start + * of its range of offsets, hence the need for + * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another + * example of why this calculation is needed. + */ + if ((err = vdev_disk_physio(cvd, + ((char *)rc->rc_data) + colskip, colsize, + VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, + flags, isdump)) != 0) + break; + } + + vdev_raidz_map_free(rm); +#endif /* KERNEL */ + + return (err); +} +#endif + static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { @@ -1517,7 +1712,24 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } -static int +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. 
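As a hedged sketch of step 1 of the write outline above for single (P) parity only: in the GF(2^8) arithmetic described at the top of this file, addition is XOR, so the P column is the XOR of all data columns. The buffer layout and names below are illustrative, not the patch's raidz_map representation.

#include <stddef.h>
#include <string.h>

static void
raidz_gen_p_example(unsigned char *p, unsigned char *const data[], int ndata,
    size_t colsize)
{
	int c;
	size_t i;

	/* Start with the first data column, then fold in the rest. */
	memcpy(p, data[0], colsize);
	for (c = 1; c < ndata; c++)
		for (i = 0; i < colsize; i++)
			p[i] ^= data[c][i];
}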
+ */ +static void vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1527,11 +1739,30 @@ vdev_raidz_io_start(zio_t *zio) raidz_col_t *rc; int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset, + zio->io_type == ZIO_TYPE_FREE, + tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + if (zio->io_type == ZIO_TYPE_FREE) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_data, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + zio_execute(zio); + return; + } + if (zio->io_type == ZIO_TYPE_WRITE) { vdev_raidz_generate_parity(rm); @@ -1561,7 +1792,8 @@ vdev_raidz_io_start(zio_t *zio) ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } - return (ZIO_PIPELINE_CONTINUE); + zio_execute(zio); + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -1578,7 +1810,7 @@ vdev_raidz_io_start(zio_t *zio) rm->rm_missingdata++; else rm->rm_missingparity++; - rc->rc_error = ENXIO; + rc->rc_error = SET_ERROR(ENXIO); rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; @@ -1588,7 +1820,7 @@ vdev_raidz_io_start(zio_t *zio) rm->rm_missingdata++; else rm->rm_missingparity++; - rc->rc_error = ESTALE; + rc->rc_error = SET_ERROR(ESTALE); rc->rc_skipped = 1; continue; } @@ -1601,9 +1833,10 @@ vdev_raidz_io_start(zio_t *zio) } } - return (ZIO_PIPELINE_CONTINUE); + zio_execute(zio); } + /* * Report a checksum error for a child of a RAID-Z device. */ @@ -1659,6 +1892,13 @@ raidz_parity_verify(zio_t *zio, raidz_ma int c, ret = 0; raidz_col_t *rc; + blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) @@ -1675,7 +1915,7 @@ raidz_parity_verify(zio_t *zio, raidz_ma continue; if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); - rc->rc_error = ECKSUM; + rc->rc_error = SET_ERROR(ECKSUM); ret++; } zio_buf_free(orig[c], rc->rc_size); @@ -1799,7 +2039,7 @@ vdev_raidz_combrec(zio_t *zio, int total if (rc->rc_tried) raidz_checksum_error(zio, rc, orig[i]); - rc->rc_error = ECKSUM; + rc->rc_error = SET_ERROR(ECKSUM); } ret = code; @@ -1856,6 +2096,27 @@ done: return (ret); } +/* + * Complete an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + * - For read operations: + * 1. Check for errors on the child IOs. + * 2. If data errors occurred: + * a. Try to reassemble the data from the parity available. + * b. If we haven't yet read the parity drives, read them now. + * c. If all parity drives have been read but the data still doesn't + * reassemble with a correct checksum, then try combinatorial + * reconstruction. + * d. If that doesn't work, return an error. + * 3. 
If there were unexpected errors or this is a resilver operation, + * rewrite the vdevs that had errors. + */ static void vdev_raidz_io_done(zio_t *zio) { @@ -1913,6 +2174,8 @@ vdev_raidz_io_done(zio_t *zio) zio->io_error = vdev_raidz_worst_error(rm); return; + } else if (zio->io_type == ZIO_TYPE_FREE) { + return; } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -2075,7 +2338,7 @@ vdev_raidz_io_done(zio_t *zio) * Start checksum ereports for all children which haven't * failed, and the IO wasn't speculative. */ - zio->io_error = ECKSUM; + zio->io_error = SET_ERROR(ECKSUM); if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { for (c = 0; c < rm->rm_cols; c++) { @@ -2113,7 +2376,7 @@ done: zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, - ZIO_TYPE_WRITE, zio->io_priority, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } @@ -2139,6 +2402,8 @@ vdev_ops_t vdev_raidz_ops = { vdev_raidz_io_start, vdev_raidz_io_done, vdev_raidz_state_change, + NULL, + NULL, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 vdev_root.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c 27 Feb 2010 22:31:14 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c 2 Sep 2013 11:38:20 -0000 @@ -19,10 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + #include #include #include @@ -50,14 +54,15 @@ too_many_errors(vdev_t *vd, int numerror } static int -vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) +vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) { int lasterror = 0; int numerrors = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (EINVAL); + return (SET_ERROR(EINVAL)); } vdev_open_children(vd); @@ -77,7 +82,9 @@ vdev_root_open(vdev_t *vd, uint64_t *asi } *asize = 0; - *ashift = 0; + *max_asize = 0; + *logical_ashift = 0; + *physical_ashift = 0; return (0); } @@ -109,6 +116,8 @@ vdev_ops_t vdev_root_ops = { NULL, /* io_start - not applicable to the root */ NULL, /* io_done - not applicable to the root */ vdev_root_state_change, + NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zap.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c 27 Feb 2010 22:31:15 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c 2 May 2017 00:32:08 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* @@ -50,9 +51,9 @@ int fzap_default_block_shift = 14; /* 16k blocksize */ -static void zap_leaf_pageout(dmu_buf_t *db, void *vl); -static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); +extern inline zap_phys_t *zap_f_phys(zap_t *zap); +static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void fzap_byteswap(void *vbuf, size_t size) @@ -80,13 +81,13 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, z ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; - (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, - &zap->zap_f.zap_phys, zap_evict); + zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; + zap->zap_dbu.dbu_evict_func_async = NULL; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); - zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1; + zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; - zp = zap->zap_f.zap_phys; + zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap @@ -112,12 +113,11 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, z * set up block 1 - the first leaf */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - 1<l_dbuf = db; - l->l_phys = db->db_data; zap_leaf_init(l, zp->zap_normflags != 0); @@ -162,9 +162,10 @@ zap_table_grow(zap_t *zap, zap_table_phy } else { newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; - ASSERT3U(tbl->zt_blks_copied, ==, 0); - dmu_prefetch(zap->zap_objset, zap->zap_object, - tbl->zt_blk << bs, tbl->zt_numblks << bs); + ASSERT0(tbl->zt_blks_copied); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + tbl->zt_blk << bs, tbl->zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); } /* @@ -173,20 +174,20 @@ zap_table_grow(zap_t *zap, zap_table_phy b = tbl->zt_blks_copied; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + b) << bs, FTAG, &db_old); + (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err) return (err); /* first half of entries in old[b] go to new[2*b+0] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+0) << bs, FTAG, &db_new)); + (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+1) << bs, FTAG, &db_new)); + (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); @@ -234,7 +235,7 @@ zap_table_store(zap_t *zap, zap_table_ph off = idx & ((1<<(bs-3))-1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db); + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err) return (err); dmu_buf_will_dirty(db, tx); @@ -246,7 +247,8 @@ zap_table_store(zap_t *zap, zap_table_ph dmu_buf_t *db2; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, + DMU_READ_NO_PREFETCH); if (err) { dmu_buf_rele(db, FTAG); return (err); @@ -269,6 +271,7 @@ zap_table_load(zap_t *zap, zap_table_phy uint64_t blk, off; int err; dmu_buf_t *db; + dnode_t *dn; int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); @@ -276,8 +279,15 @@ zap_table_load(zap_t *zap, zap_table_phy blk = idx 
>> (bs-3); off = idx & ((1<<(bs-3))-1); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs, FTAG, &db); + /* + * Note: this is equivalent to dmu_buf_hold(), but we use + * _dnode_enter / _by_dnode because it's faster because we don't + * have to hold the dnode. + */ + dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); if (err) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -291,9 +301,13 @@ zap_table_load(zap_t *zap, zap_table_phy */ blk = (idx*2) >> (bs-3); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk) << bs, FTAG, &db); - dmu_buf_rele(db, FTAG); + dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, + (tbl->zt_nextblk + blk) << bs, FTAG, &db, + DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); + if (err == 0) + dmu_buf_rele(db, FTAG); } return (err); } @@ -322,10 +336,10 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx * If we are within 2 bits of running out, stop growing, since * this is already an aberrant condition. */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) - return (ENOSPC); + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) + return (SET_ERROR(ENOSPC)); - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* * We are outgrowing the "embedded" ptrtbl (the one * stored in the header block). Give it its own entire @@ -335,13 +349,14 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx dmu_buf_t *db_new; int err; - ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, + ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); + ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); newblk = zap_allocate_blocks(zap, 1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, + DMU_READ_NO_PREFETCH); if (err) return (err); dmu_buf_will_dirty(db_new, tx); @@ -349,17 +364,17 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); - zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++; + zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; + zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; + zap_f_phys(zap)->zap_ptrtbl.zt_shift++; - ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << + ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << (FZAP_BLOCK_SHIFT(zap)-3)); return (0); } else { - return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, zap_ptrtbl_transfer, tx)); } } @@ -369,8 +384,8 @@ zap_increment_num_entries(zap_t *zap, in { dmu_buf_will_dirty(zap->zap_dbuf, tx); mutex_enter(&zap->zap_f.zap_num_entries_mtx); - ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta); - zap->zap_f.zap_phys->zap_num_entries += delta; + ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); + zap_f_phys(zap)->zap_num_entries += delta; mutex_exit(&zap->zap_f.zap_num_entries_mtx); } @@ -379,16 +394,25 @@ zap_allocate_blocks(zap_t *zap, int nblo { uint64_t newblk; 
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - newblk = zap->zap_f.zap_phys->zap_freeblk; - zap->zap_f.zap_phys->zap_freeblk += nblocks; + newblk = zap_f_phys(zap)->zap_freeblk; + zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } +static void +zap_leaf_evict_sync(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { void *winner; - zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -396,17 +420,18 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - l->l_phys = NULL; VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); - winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + DMU_READ_NO_PREFETCH)); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); - zap->zap_f.zap_phys->zap_num_leafs++; + zap_f_phys(zap)->zap_num_leafs++; return (l); } @@ -416,7 +441,7 @@ fzap_count(zap_t *zap, uint64_t *count) { ASSERT(!zap->zap_ismicro); mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ - *count = zap->zap_f.zap_phys->zap_num_entries; + *count = zap_f_phys(zap)->zap_num_entries; mutex_exit(&zap->zap_f.zap_num_entries_mtx); return (0); } @@ -432,16 +457,6 @@ zap_put_leaf(zap_leaf_t *l) dmu_buf_rele(l->l_dbuf, NULL); } -_NOTE(ARGSUSED(0)) -static void -zap_leaf_pageout(dmu_buf_t *db, void *vl) -{ - zap_leaf_t *l = vl; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { @@ -449,20 +464,20 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t ASSERT(blkid != 0); - l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; - l->l_bs = highbit(db->db_size)-1; + l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; - l->l_phys = NULL; - winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(NULL, l); + zap_leaf_evict_sync(&l->l_dbu); l = winner; } @@ -471,7 +486,7 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t * chain. There should be no chained leafs (as we have removed * support for them). 
*/ - ASSERT3U(l->l_phys->l_hdr.lh_pad1, ==, 0); + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); /* * There should be more hash entries than there can be @@ -481,11 +496,11 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t /* The chunks should begin at the end of the hash table */ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, - &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); /* The chunks should end at the end of the block */ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - - (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size); + (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); return (l); } @@ -501,8 +516,10 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - blkid << bs, NULL, &db); + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, + blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); if (err) return (err); @@ -518,16 +535,15 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t rw_enter(&l->l_rwlock, lt); /* - * Must lock before dirtying, otherwise l->l_phys could change, + * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, * causing ASSERT below to fail. */ if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); - ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); - ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); @@ -538,13 +554,13 @@ zap_idx_to_blk(zap_t *zap, uint64_t idx, { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, - (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); + (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); return (0); } else { - return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, valp)); } } @@ -555,11 +571,11 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t ASSERT(tx != NULL); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; return (0); } else { - return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, blk, tx)); } } @@ -571,52 +587,61 @@ zap_deref_leaf(zap_t *zap, uint64_t h, d int err; ASSERT(zap->zap_dbuf == NULL || - zap->zap_f.zap_phys == zap->zap_dbuf->db_data); - ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); - idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + zap_f_phys(zap) == zap->zap_dbuf->db_data); + + /* Reality check for corrupt zap objects (leaf or header). 
*/ + if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && + zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || + zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { + return (SET_ERROR(EIO)); + } + + idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) == - (*lp)->l_phys->l_hdr.lh_prefix); + ASSERT(err || + ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == + zap_leaf_phys(*lp)->l_hdr.lh_prefix); return (err); } static int -zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; zap_leaf_t *nl; int prefix_diff, i, err; uint64_t sibling; - int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; - ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - l->l_phys->l_hdr.lh_prefix); + zap_leaf_phys(l)->l_hdr.lh_prefix); if (zap_tryupgradedir(zap, tx) == 0 || - old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { /* We failed to upgrade, or need to grow the pointer table */ objset_t *os = zap->zap_objset; uint64_t object = zap->zap_object; zap_put_leaf(l); - zap_unlockdir(zap); + zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, - FALSE, FALSE, &zn->zn_zap); + FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err) return (err); ASSERT(!zap->zap_ismicro); while (old_prefix_len == - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) { + zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); if (err) return (err); @@ -626,18 +651,18 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf if (err) return (err); - if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) { + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ *lp = l; return (0); } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); + ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, - l->l_phys->l_hdr.lh_prefix); + zap_leaf_phys(l)->l_hdr.lh_prefix); - prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - + prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; @@ -654,12 +679,12 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ - for (i = 0; i < (1ULL<l_blkid, tx); - ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ + ASSERT0(err); /* we checked for i/o errors above */ } - if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) { + if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { /* we want the sibling */ zap_put_leaf(l); *lp = nl; @@ -672,16 +697,17 @@ zap_expand_leaf(zap_name_t *zn, zap_leaf } static void -zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx) { zap_t *zap = 
zn->zn_zap; - int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; - int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift && - l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && + zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); zap_put_leaf(l); - if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) { + if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { int err; /* @@ -692,16 +718,16 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_ objset_t *os = zap->zap_objset; uint64_t zapobj = zap->zap_object; - zap_unlockdir(zap); + zap_unlockdir(zap, tag); err = zap_lockdir(os, zapobj, tx, - RW_WRITER, FALSE, FALSE, &zn->zn_zap); + RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err) return; } /* could have finished growing while our locks were down */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift) + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) (void) zap_grow_ptrtbl(zap, tx); } } @@ -710,7 +736,7 @@ static int fzap_checkname(zap_name_t *zn) { if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) - return (ENAMETOOLONG); + return (SET_ERROR(ENAMETOOLONG)); return (0); } @@ -725,7 +751,7 @@ fzap_checksize(uint64_t integer_size, ui case 8: break; default: - return (EINVAL); + return (SET_ERROR(EINVAL)); } if (integer_size * num_integers > ZAP_MAXVALUELEN) @@ -784,7 +810,7 @@ fzap_lookup(zap_name_t *zn, int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx) + const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; @@ -801,7 +827,7 @@ fzap_add_cd(zap_name_t *zn, retry: err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { - err = EEXIST; + err = SET_ERROR(EEXIST); goto out; } if (err != ENOENT) @@ -813,7 +839,7 @@ retry: if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tx, &l); + err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; @@ -821,26 +847,27 @@ retry: out: if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, - const void *val, dmu_tx_t *tx) + const void *val, void *tag, dmu_tx_t *tx) { int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_NEED_CD, tx)); + val, ZAP_NEED_CD, tag, tx)); } int fzap_update(zap_name_t *zn, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) + int integer_size, uint64_t num_integers, const void *val, + void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err, create; @@ -870,14 +897,14 @@ retry: } if (err == EAGAIN) { - err = zap_expand_leaf(zn, l, tx, &l); + err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } if (zap != NULL) - zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } @@ -924,10 +951,39 @@ fzap_remove(zap_name_t *zn, dmu_tx_t *tx return (err); } +void +fzap_prefetch(zap_name_t *zn) +{ + uint64_t idx, blk; + zap_t *zap = zn->zn_zap; + int bs; + + idx = ZAP_HASH_IDX(zn->zn_hash, + zap_f_phys(zap)->zap_ptrtbl.zt_shift); + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; 
+ bs = FZAP_BLOCK_SHIFT(zap); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); +} + /* * Helper functions for consumers. */ +uint64_t +zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, dmu_tx_t *tx) +{ + uint64_t new_obj; + + VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); + VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, + tx)); + + return (new_obj); +} + int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) @@ -960,18 +1016,77 @@ zap_join(objset_t *os, uint64_t fromobj, zap_attribute_t za; int err; + err = 0; for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { - if (za.za_integer_length != 8 || za.za_num_integers != 1) - return (EINVAL); + if (za.za_integer_length != 8 || za.za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } err = zap_add(os, intoobj, za.za_name, 8, 1, &za.za_first_integer, tx); if (err) - return (err); + break; } zap_cursor_fini(&zc); - return (0); + return (err); +} + +int +zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + err = 0; + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za.za_integer_length != 8 || za.za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + err = zap_add(os, intoobj, za.za_name, + 8, 1, &value, tx); + if (err) + break; + } + zap_cursor_fini(&zc); + return (err); +} + +int +zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err; + + err = 0; + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + uint64_t delta = 0; + + if (za.za_integer_length != 8 || za.za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + + err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); + if (err != 0 && err != ENOENT) + break; + delta += za.za_first_integer; + err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); + if (err) + break; + } + zap_cursor_fini(&zc); + return (err); } int @@ -1002,17 +1117,44 @@ zap_lookup_int(objset_t *os, uint64_t ob } int -zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, - dmu_tx_t *tx) +zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_update_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_update(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_lookup(os, obj, name, 8, 1, valuep)); +} + +int +zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx) +{ uint64_t value = 0; int err; if (delta == 0) return (0); - (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); @@ -1024,6 +1166,15 @@ 
zap_increment_int(objset_t *os, uint64_t return (err); } +int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_increment(os, obj, name, delta, tx)); +} /* * Routines for iterating over the attributes. @@ -1041,8 +1192,8 @@ fzap_cursor_retrieve(zap_t *zap, zap_cur if (zc->zc_leaf && (ZAP_HASH_IDX(zc->zc_hash, - zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) != - zc->zc_leaf->l_phys->l_hdr.lh_prefix)) { + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; @@ -1063,10 +1214,11 @@ again: if (err == ENOENT) { uint64_t nocare = - (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1; + (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; zc->zc_cd = 0; - if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || + zc->zc_hash == 0) { zc->zc_hash = -1ULL; } else { zap_put_leaf(zc->zc_leaf); @@ -1098,7 +1250,6 @@ again: return (err); } - static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { @@ -1133,7 +1284,7 @@ fzap_cursor_move_to_key(zap_cursor_t *zc zap_entry_handle_t zeh; if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) - return (ENAMETOOLONG); + return (SET_ERROR(ENAMETOOLONG)); err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) @@ -1159,43 +1310,44 @@ fzap_get_stats(zap_t *zap, zap_stats_t * /* * Set zap_phys_t fields */ - zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs; - zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries; - zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk; - zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type; - zs->zs_magic = zap->zap_f.zap_phys->zap_magic; - zs->zs_salt = zap->zap_f.zap_phys->zap_salt; + zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; + zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; + zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; + zs->zs_block_type = zap_f_phys(zap)->zap_block_type; + zs->zs_magic = zap_f_phys(zap)->zap_magic; + zs->zs_salt = zap_f_phys(zap)->zap_salt; /* * Set zap_ptrtbl fields */ - zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; - zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; zs->zs_ptrtbl_blks_copied = - zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied; - zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk; - zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; - zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift; + zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* the ptrtbl is entirely in the header block. 
*/ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { int b; - dmu_prefetch(zap->zap_objset, zap->zap_object, - zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs, - zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); - for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; + for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, - FTAG, &db); + (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); @@ -1206,8 +1358,8 @@ fzap_get_stats(zap_t *zap, zap_stats_t * } int -fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, - uint64_t *tooverwrite) +fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite, + refcount_t *tooverwrite) { zap_t *zap = zn->zn_zap; zap_leaf_t *l; @@ -1217,9 +1369,11 @@ fzap_count_write(zap_name_t *zn, int add * Account for the header block of the fatzap. */ if (!add && dmu_buf_freeable(zap->zap_dbuf)) { - *tooverwrite += zap->zap_dbuf->db_size; + (void) refcount_add_many(tooverwrite, + zap->zap_dbuf->db_size, FTAG); } else { - *towrite += zap->zap_dbuf->db_size; + (void) refcount_add_many(towrite, + zap->zap_dbuf->db_size, FTAG); } /* @@ -1231,10 +1385,13 @@ fzap_count_write(zap_name_t *zn, int add * could extend the table. */ if (add) { - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) - *towrite += zap->zap_dbuf->db_size; - else - *towrite += (zap->zap_dbuf->db_size * 3); + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { + (void) refcount_add_many(towrite, + zap->zap_dbuf->db_size, FTAG); + } else { + (void) refcount_add_many(towrite, + zap->zap_dbuf->db_size * 3, FTAG); + } } /* @@ -1247,13 +1404,14 @@ fzap_count_write(zap_name_t *zn, int add } if (!add && dmu_buf_freeable(l->l_dbuf)) { - *tooverwrite += l->l_dbuf->db_size; + (void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG); } else { /* * If this an add operation, the leaf block could split. * Hence, we need to account for an additional leaf block. */ - *towrite += (add ? 2 : 1) * l->l_dbuf->db_size; + (void) refcount_add_many(towrite, + (add ? 2 : 1) * l->l_dbuf->db_size, FTAG); } zap_put_leaf(l); Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zap_leaf.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c 27 Feb 2010 22:31:15 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c 22 Nov 2015 17:22:33 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
*/ /* @@ -37,6 +37,7 @@ #include #include #include +#include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); @@ -47,10 +48,12 @@ static uint16_t *zap_leaf_rehash_entry(z #define LEAF_HASH(l, h) \ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \ - ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len))) + ((h) >> \ + (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) -#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)]) +#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) +extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); static void zap_memset(void *a, int c, size_t n) @@ -104,16 +107,19 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, { int i; zap_leaf_t l; - l.l_bs = highbit(size)-1; - l.l_phys = buf; + dmu_buf_t l_dbuf; - buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); - buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); - buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); - buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); - buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); - buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); - buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); + l_dbuf.db_data = buf; + l.l_bs = highbit64(size) - 1; + l.l_dbuf = &l_dbuf; + + buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type); + buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix); + buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic); + buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree); + buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries); + buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); + buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); @@ -156,19 +162,21 @@ zap_leaf_init(zap_leaf_t *l, boolean_t s { int i; - l->l_bs = highbit(l->l_dbuf->db_size)-1; - zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + l->l_bs = highbit64(l->l_dbuf->db_size) - 1; + zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + sizeof (struct zap_leaf_header)); + zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + 2*ZAP_LEAF_HASH_NUMENTRIES(l)); for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END; - l->l_phys->l_hdr.lh_block_type = ZBT_LEAF; - l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC; - l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; + zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); if (sort) - l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; } /* @@ -180,15 +188,16 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { int chunk; - ASSERT(l->l_phys->l_hdr.lh_nfree > 0); + ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - chunk = l->l_phys->l_hdr.lh_freelist; + chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); - l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; + zap_leaf_phys(l)->l_hdr.lh_freelist = + ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; - l->l_phys->l_hdr.lh_nfree--; + zap_leaf_phys(l)->l_hdr.lh_nfree--; return 
(chunk); } @@ -197,16 +206,16 @@ static void zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) { struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; - ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); zlf->lf_type = ZAP_CHUNK_FREE; - zlf->lf_next = l->l_phys->l_hdr.lh_freelist; + zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ - l->l_phys->l_hdr.lh_freelist = chunk; + zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; - l->l_phys->l_hdr.lh_nfree++; + zap_leaf_phys(l)->l_hdr.lh_nfree++; } /* @@ -220,7 +229,7 @@ zap_leaf_array_create(zap_leaf_t *l, con uint16_t chunk_head; uint16_t *chunkp = &chunk_head; int byten = 0; - uint64_t value; + uint64_t value = 0; int shift = (integer_size-1)*8; int len = num_integers; @@ -392,7 +401,7 @@ zap_leaf_lookup(zap_leaf_t *l, zap_name_ uint16_t *chunkp; struct zap_leaf_entry *le; - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); again: for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); @@ -412,7 +421,7 @@ again: * lowest-cd match for MT_FIRST. */ ASSERT(zn->zn_matchtype == MT_EXACT || - (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); + (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, le->le_name_numints)) { zeh->zeh_num_integers = le->le_value_numints; @@ -434,7 +443,7 @@ again: goto again; } - return (ENOENT); + return (SET_ERROR(ENOENT)); } /* Return (h1,cd1 >= h2,cd2) */ @@ -452,10 +461,10 @@ zap_leaf_lookup_closest(zap_leaf_t *l, uint16_t lh; struct zap_leaf_entry *le; - ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { - for (chunk = l->l_phys->l_hash[lh]; + for (chunk = zap_leaf_phys(l)->l_hash[lh]; chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); @@ -492,14 +501,14 @@ zap_entry_read(const zap_entry_handle_t ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); if (le->le_value_intlen > integer_size) - return (EINVAL); + return (SET_ERROR(EINVAL)); zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_value_intlen, le->le_value_numints, integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) - return (EOVERFLOW); + return (SET_ERROR(EOVERFLOW)); return (0); } @@ -520,13 +529,13 @@ zap_entry_read_name(zap_t *zap, const za le->le_name_numints, 1, buflen, buf); } if (le->le_name_numints > buflen) - return (EOVERFLOW); + return (SET_ERROR(EOVERFLOW)); return (0); } int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf) + uint8_t integer_size, uint64_t num_integers, const void *buf) { int delta_chunks; zap_leaf_t *l = zeh->zeh_leaf; @@ -535,16 +544,8 @@ zap_entry_update(zap_entry_handle_t *zeh delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); - if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) - return (EAGAIN); - - /* - * We should search other chained leaves (via - * zap_entry_remove,create?) otherwise returning EAGAIN will - * just send us into an infinite loop if we have to chain - * another leaf block, rather than being able to split this - * block. 
- */ + if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) + return (SET_ERROR(EAGAIN)); zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = @@ -573,7 +574,7 @@ zap_entry_remove(zap_entry_handle_t *zeh *zeh->zeh_chunkp = le->le_next; zap_leaf_chunk_free(l, entry_chunk); - l->l_phys->l_hdr.lh_nentries--; + zap_leaf_phys(l)->l_hdr.lh_nentries--; } int @@ -597,7 +598,7 @@ zap_entry_create(zap_leaf_t *l, zap_name if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ - if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { + if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; for (chunk = *LEAF_HASH_ENTPTR(l, h); @@ -633,8 +634,8 @@ zap_entry_create(zap_leaf_t *l, zap_name ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } - if (l->l_phys->l_hdr.lh_nfree < numchunks) - return (EAGAIN); + if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) + return (SET_ERROR(EAGAIN)); /* make the entry */ chunk = zap_leaf_chunk_alloc(l); @@ -654,7 +655,7 @@ zap_entry_create(zap_leaf_t *l, zap_name /* XXX if we did the search above, we could just use that */ chunkp = zap_leaf_rehash_entry(l, chunk); - l->l_phys->l_hdr.lh_nentries++; + zap_leaf_phys(l)->l_hdr.lh_nentries++; zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; @@ -788,8 +789,8 @@ zap_leaf_transfer_entry(zap_leaf_t *l, i zap_leaf_chunk_free(l, entry); - l->l_phys->l_hdr.lh_nentries--; - nl->l_phys->l_hdr.lh_nentries++; + zap_leaf_phys(l)->l_hdr.lh_nentries--; + zap_leaf_phys(nl)->l_hdr.lh_nentries++; } /* @@ -799,19 +800,22 @@ void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { int i; - int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len; + int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ - l->l_phys->l_hdr.lh_prefix <<= 1; - l->l_phys->l_hdr.lh_prefix_len++; - nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1; - nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len; + zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; + zap_leaf_phys(l)->l_hdr.lh_prefix_len++; + zap_leaf_phys(nl)->l_hdr.lh_prefix = + zap_leaf_phys(l)->l_hdr.lh_prefix | 1; + zap_leaf_phys(nl)->l_hdr.lh_prefix_len = + zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); + zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) - l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; + zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; /* * Transfer entries whose hash bit 'bit' is set to nl; rehash @@ -839,25 +843,25 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l { int i, n; - n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - - l->l_phys->l_hdr.lh_prefix_len; + n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; - n = l->l_phys->l_hdr.lh_nentries/5; + n = zap_leaf_phys(l)->l_hdr.lh_nentries/5; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_with_n5_entries[n]++; n = ((1<l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / + zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 / (1<zs_blocks_n_tenths_full[n]++; for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { int nentries = 0; - int chunk = l->l_phys->l_hash[i]; + int chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c 
=================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c,v retrieving revision 1.3 diff -u -p -r1.3 zap_micro.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c 27 Feb 2010 23:43:53 -0000 1.3 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c 6 May 2017 23:27:27 -0000 @@ -19,8 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include @@ -32,19 +34,24 @@ #include #include #include +#include +#include #ifdef _KERNEL #include #endif -static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); +extern inline mzap_phys_t *zap_m_phys(zap_t *zap); + +static int mzap_upgrade(zap_t **zapp, + void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); - return (zap->zap_u.zap_fat.zap_phys->zap_flags); + return (zap_f_phys(zap)->zap_flags); } int @@ -255,27 +262,33 @@ mze_compare(const void *arg1, const void return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); - if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) + if (mze1->mze_cd > mze2->mze_cd) return (+1); - if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) + if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } -static void -mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) +static int +mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; + avl_index_t idx; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < zap_maxcd(zap)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; - mze->mze_phys = *mzep; - avl_add(&zap->zap_m.zap_avl, mze); + mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; + ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); + if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { + kmem_free(mze, sizeof (mzap_ent_t)); + return (EEXIST); + } + avl_insert(&zap->zap_m.zap_avl, mze, idx); + return (0); } static mzap_ent_t * @@ -290,14 +303,15 @@ mze_find(zap_name_t *zn) ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { - if (zap_match(zn, mze->mze_phys.mze_name)) + ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); + if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } if (zn->zn_matchtype == MT_BEST) { @@ -320,12 +334,12 @@ mze_find_unused_cd(zap_t *zap, uint64_t ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; - mze_tofind.mze_phys.mze_cd = 0; + mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { - if (mze->mze_phys.mze_cd != cd) + if (mze->mze_cd != cd) break; cd++; } @@ -360,6 +374,9 @@ mzap_open(objset_t *os, uint64_t obj, dm zap_t *winner; zap_t *zap; int i; + uint64_t *zap_hdr = (uint64_t *)db->db_data; + uint64_t zap_block_type = zap_hdr[0]; + uint64_t zap_magic = 
zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); @@ -370,9 +387,13 @@ mzap_open(objset_t *os, uint64_t obj, dm zap->zap_object = obj; zap->zap_dbuf = db; - if (*(uint64_t *)db->db_data != ZBT_MICRO) { + if (zap_block_type != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); - zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; + zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; + if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { + winner = NULL; /* No actual winner here... */ + goto handle_winner; + } } else { zap->zap_ismicro = TRUE; } @@ -382,40 +403,40 @@ mzap_open(objset_t *os, uint64_t obj, dm * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ - winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); + dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); + winner = dmu_buf_set_user(db, &zap->zap_dbu); - if (winner != NULL) { - rw_exit(&zap->zap_rwlock); - rw_destroy(&zap->zap_rwlock); - if (!zap->zap_ismicro) - mutex_destroy(&zap->zap_f.zap_num_entries_mtx); - kmem_free(zap, sizeof (zap_t)); - return (winner); - } + if (winner != NULL) + goto handle_winner; if (zap->zap_ismicro) { - zap->zap_salt = zap->zap_m.zap_phys->mz_salt; - zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; + zap->zap_salt = zap_m_phys(zap)->mz_salt; + zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = - &zap->zap_m.zap_phys->mz_chunk[i]; + &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap_name_t *zn; - zap->zap_m.zap_num_entries++; zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); - mze_insert(zap, i, zn->zn_hash, mze); + if (mze_insert(zap, i, zn->zn_hash) == 0) + zap->zap_m.zap_num_entries++; + else { + printf("ZFS WARNING: Duplicated ZAP " + "entry detected (%s).\n", + mze->mze_name); + } zap_name_free(zn); } } } else { - zap->zap_salt = zap->zap_f.zap_phys->zap_salt; - zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; + zap->zap_salt = zap_f_phys(zap)->zap_salt; + zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); @@ -425,7 +446,7 @@ mzap_open(objset_t *os, uint64_t obj, dm * other members. 
*/ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, - &zap->zap_f.zap_phys->zap_salt); + &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of @@ -433,39 +454,53 @@ mzap_open(objset_t *os, uint64_t obj, dm */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_f.zap_phys, ==, + (uintptr_t)zap_f_phys(zap), ==, zap->zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); + +handle_winner: + rw_exit(&zap->zap_rwlock); + rw_destroy(&zap->zap_rwlock); + if (!zap->zap_ismicro) + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + kmem_free(zap, sizeof (zap_t)); + return (winner); } -int -zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, +static int +zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { zap_t *zap; - dmu_buf_t *db; krw_t lt; - int err; - *zapp = NULL; + ASSERT0(db->db_offset); + objset_t *os = dmu_buf_get_objset(db); + uint64_t obj = db->db_object; - err = dmu_buf_hold(os, obj, 0, NULL, &db); - if (err) - return (err); + *zapp = NULL; #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif zap = dmu_buf_get_user(db); - if (zap == NULL) + if (zap == NULL) { zap = mzap_open(os, obj, db); + if (zap == NULL) { + /* + * mzap_open() didn't like what it saw on-disk. + * Check for corruption! + */ + return (SET_ERROR(EIO)); + } + } /* * We're checking zap_ismicro without the lock held, in order to @@ -501,10 +536,12 @@ zap_lockdir(objset_t *os, uint64_t obj, dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; - return (mzap_upgrade(zapp, tx, 0)); + int err = mzap_upgrade(zapp, tag, tx, 0); + if (err != 0) + rw_exit(&zap->zap_rwlock); + return (err); } - err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); - ASSERT3U(err, ==, 0); + VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } @@ -513,15 +550,49 @@ zap_lockdir(objset_t *os, uint64_t obj, return (0); } +static int +zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + int err; + + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + return (err); + } + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { + dmu_buf_rele(db, tag); + } + return (err); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + int err; + + err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) + dmu_buf_rele(db, tag); + return (err); +} + void -zap_unlockdir(zap_t *zap) +zap_unlockdir(zap_t *zap, void *tag) { rw_exit(&zap->zap_rwlock); - dmu_buf_rele(zap->zap_dbuf, NULL); + dmu_buf_rele(zap->zap_dbuf, tag); } static int -mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) +mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; int i, sz, nchunks; @@ -531,7 +602,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); sz = zap->zap_dbuf->db_size; - mzp = kmem_alloc(sz, KM_SLEEP); + mzp = zio_buf_alloc(sz); 
bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; @@ -539,7 +610,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err) { - kmem_free(mzp, sz); + zio_buf_free(mzp, sz); return (err); } } @@ -559,31 +630,32 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); - err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); + err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, + tag, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); if (err) break; } - kmem_free(mzp, sz); + zio_buf_free(mzp, sz); *zapp = zap; return (err); } -static void +void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; - VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); + VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); - ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif @@ -598,9 +670,9 @@ mzap_create_impl(objset_t *os, uint64_t zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, - B_FALSE, B_FALSE, &zap)); - VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); - zap_unlockdir(zap); + B_FALSE, B_FALSE, FTAG, &zap)); + VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags)); + zap_unlockdir(zap, FTAG); } } @@ -651,9 +723,9 @@ zap_create_flags(objset_t *os, int normf uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && - leaf_blockshift <= SPA_MAXBLOCKSHIFT && + leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && indirect_blockshift >= SPA_MINBLOCKSHIFT && - indirect_blockshift <= SPA_MAXBLOCKSHIFT); + indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); VERIFY(dmu_object_set_blocksize(os, obj, 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); @@ -674,11 +746,10 @@ zap_destroy(objset_t *os, uint64_t zapob return (dmu_object_free(os, zapobj, tx)); } -_NOTE(ARGSUSED(0)) void -zap_evict(dmu_buf_t *db, void *vzap) +zap_evict_sync(void *dbu) { - zap_t *zap = vzap; + zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); @@ -696,7 +767,7 @@ zap_count(objset_t *os, uint64_t zapobj, zap_t *zap; int err; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); if (!zap->zap_ismicro) { @@ -704,7 +775,7 @@ zap_count(objset_t *os, uint64_t zapobj, } else { *count = zap->zap_m.zap_num_entries; } - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -728,11 +799,11 @@ again: other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { - zn = zap_name_alloc(zap, mze->mze_phys.mze_name, + zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_FIRST); allocdzn = B_TRUE; } - if (zap_match(zn, other->mze_phys.mze_name)) { + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -761,25 +832,19 @@ zap_lookup(objset_t *os, uint64_t zapobj num_integers, buf, MT_EXACT, NULL, 0, NULL)); } -int -zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, +static int +zap_lookup_impl(zap_t *zap, const char *name, uint64_t 
integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { - zap_t *zap; - int err; + int err = 0; mzap_ent_t *mze; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); - if (err) - return (err); zn = zap_name_alloc(zap, name, mt); - if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); - } + if (zn == NULL) + return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, @@ -787,17 +852,18 @@ zap_lookup_norm(objset_t *os, uint64_t z } else { mze = mze_find(zn); if (mze == NULL) { - err = ENOENT; + err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { - err = EOVERFLOW; + err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { - err = EINVAL; + err = SET_ERROR(EINVAL); } else { - *(uint64_t *)buf = mze->mze_phys.mze_value; + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; if (realname != NULL) (void) strlcpy(realname, - mze->mze_phys.mze_name, rn_len); + MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); @@ -806,7 +872,74 @@ zap_lookup_norm(objset_t *os, uint64_t z } } zap_name_free(zn); - zap_unlockdir(zap); + return (err); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm_by_dnode(dn, name, integer_size, + num_integers, buf, MT_EXACT, NULL, 0, NULL)); +} + +int +zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); return (err); } @@ -818,27 +951,27 @@ zap_lookup_uint64(objset_t *os, uint64_t int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { - int err 
= (zap_lookup_norm(os, zapobj, name, 0, - 0, NULL, MT_EXACT, NULL, 0, NULL)); + int err = zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, MT_EXACT, NULL, 0, NULL); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); @@ -853,20 +986,20 @@ zap_length(objset_t *os, uint64_t zapobj mzap_ent_t *mze; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, MT_EXACT); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { mze = mze_find(zn); if (mze == NULL) { - err = ENOENT; + err = SET_ERROR(ENOENT); } else { if (integer_size) *integer_size = 8; @@ -875,7 +1008,7 @@ zap_length(objset_t *os, uint64_t zapobj } } zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -887,17 +1020,17 @@ zap_length_uint64(objset_t *os, uint64_t int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -913,7 +1046,7 @@ mzap_addent(zap_name_t *zn, uint64_t val #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif @@ -924,7 +1057,7 @@ mzap_addent(zap_name_t *zn, uint64_t val again: for (i = start; i < zap->zap_m.zap_num_chunks; i++) { - mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; + mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; @@ -934,7 +1067,7 @@ again: if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; - mze_insert(zap, i, zn->zn_hash, mze); + VERIFY(0 == mze_insert(zap, i, zn->zn_hash)); return; } } @@ -956,27 +1089,29 @@ zap_add(objset_t *os, uint64_t zapobj, c const uint64_t *intval = val; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { - err = fzap_add(zn, integer_size, num_integers, val, tx); + err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN) { - err = mzap_upgrade(&zn->zn_zap, tx, 0); - if (err == 0) - err = fzap_add(zn, integer_size, num_integers, val, tx); + err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + if (err == 0) { + err = fzap_add(zn, integer_size, num_integers, val, + FTAG, tx); + } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { mze = mze_find(zn); if (mze != NULL) { - err = EEXIST; + err = 
SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); } @@ -984,7 +1119,7 @@ zap_add(objset_t *os, uint64_t zapobj, c ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -997,19 +1132,19 @@ zap_add_uint64(objset_t *os, uint64_t za int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } - err = fzap_add(zn, integer_size, num_integers, val, tx); + err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1019,36 +1154,47 @@ zap_update(objset_t *os, uint64_t zapobj { zap_t *zap; mzap_ent_t *mze; + uint64_t oldval; const uint64_t *intval = val; zap_name_t *zn; int err; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); +#ifdef ZFS_DEBUG + /* + * If there is an old value, it shouldn't change across the + * lockdir (eg, due to bprewrite's xlation). + */ + if (integer_size == 8 && num_integers == 1) + (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); +#endif + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, MT_EXACT); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { - err = fzap_update(zn, integer_size, num_integers, val, tx); + err = fzap_update(zn, integer_size, num_integers, val, + FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx, 0); - if (err == 0) + err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + if (err == 0) { err = fzap_update(zn, integer_size, num_integers, - val, tx); + val, FTAG, tx); + } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { mze = mze_find(zn); if (mze != NULL) { - mze->mze_phys.mze_value = *intval; - zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid].mze_value = *intval; + ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); + MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } @@ -1056,7 +1202,7 @@ zap_update(objset_t *os, uint64_t zapobj ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1069,19 +1215,19 @@ zap_update_uint64(objset_t *os, uint64_t zap_name_t *zn; int err; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, tx); + err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change 
zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1100,29 +1246,29 @@ zap_remove_norm(objset_t *os, uint64_t z mzap_ent_t *mze; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, mt); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { mze = mze_find(zn); if (mze == NULL) { - err = ENOENT; + err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; - bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], + bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } } zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1134,17 +1280,17 @@ zap_remove_uint64(objset_t *os, uint64_t int err; zap_name_t *zn; - err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap); - return (ENOTSUP); + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } @@ -1176,7 +1322,7 @@ zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); - zap_unlockdir(zc->zc_zap); + zap_unlockdir(zc->zc_zap, NULL); zc->zc_zap = NULL; } if (zc->zc_leaf) { @@ -1218,12 +1364,12 @@ zap_cursor_retrieve(zap_cursor_t *zc, za mzap_ent_t *mze; if (zc->zc_hash == -1ULL) - return (ENOENT); + return (SET_ERROR(ENOENT)); if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, &zc->zc_zap); + RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); if (err) return (err); @@ -1244,10 +1390,8 @@ zap_cursor_retrieve(zap_cursor_t *zc, za if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { - err = ENOENT; - mze_tofind.mze_hash = zc->zc_hash; - mze_tofind.mze_phys.mze_cd = zc->zc_cd; + mze_tofind.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { @@ -1255,21 +1399,20 @@ zap_cursor_retrieve(zap_cursor_t *zc, za idx, AVL_AFTER); } if (mze) { - ASSERT(0 == bcmp(&mze->mze_phys, - &zc->zc_zap->zap_m.zap_phys->mz_chunk - [mze->mze_chunkid], sizeof (mze->mze_phys))); - + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; - za->za_first_integer = mze->mze_phys.mze_value; - (void) strcpy(za->za_name, mze->mze_phys.mze_name); + za->za_first_integer = mzep->mze_value; + (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; + err = SET_ERROR(ENOENT); } } rw_exit(&zc->zc_zap->zap_rwlock); @@ -1293,7 +1436,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, if (zc->zc_zap == NULL) { err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, - RW_READER, TRUE, FALSE, &zc->zc_zap); + RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap); if 
(err) return (err); } else { @@ -1303,7 +1446,7 @@ zap_cursor_move_to_key(zap_cursor_t *zc, zn = zap_name_alloc(zc->zc_zap, name, mt); if (zn == NULL) { rw_exit(&zc->zc_zap->zap_rwlock); - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } if (!zc->zc_zap->zap_ismicro) { @@ -1311,11 +1454,11 @@ zap_cursor_move_to_key(zap_cursor_t *zc, } else { mze = mze_find(zn); if (mze == NULL) { - err = ENOENT; + err = SET_ERROR(ENOENT); goto out; } zc->zc_hash = mze->mze_hash; - zc->zc_cd = mze->mze_phys.mze_cd; + zc->zc_cd = mze->mze_cd; } out: @@ -1330,7 +1473,7 @@ zap_get_stats(objset_t *os, uint64_t zap int err; zap_t *zap; - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); @@ -1343,42 +1486,43 @@ zap_get_stats(objset_t *os, uint64_t zap } else { fzap_get_stats(zap, zs); } - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (0); } int -zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, - uint64_t *towrite, uint64_t *tooverwrite) +zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, + refcount_t *towrite, refcount_t *tooverwrite) { zap_t *zap; int err = 0; - /* * Since, we don't have a name, we cannot figure out which blocks will * be affected in this operation. So, account for the worst case : * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: - * - 2 blocks for possibly split leaves, - * - 2 grown ptrtbl blocks + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks * - * This also accomodates the case where an add operation to a fairly + * This also accommodates the case where an add operation to a fairly * large microzap results in a promotion to fatzap. */ if (name == NULL) { - *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + (void) refcount_add_many(towrite, + (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG); return (err); } /* - * We lock the zap with adding == FALSE. Because, if we pass + * We lock the zap with adding == FALSE. Because, if we pass * the actual value of add, it could trigger a mzap_upgrade(). * At present we are just evaluating the possibility of this operation - * and hence we donot want to trigger an upgrade. + * and hence we do not want to trigger an upgrade. */ - err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); - if (err) + err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) return (err); if (!zap->zap_ismicro) { @@ -1391,7 +1535,8 @@ zap_count_write(objset_t *os, uint64_t z /* * We treat this case as similar to (name == NULL) */ - *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + (void) refcount_add_many(towrite, + (3 + (add ? 
4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG); } } else { /* @@ -1409,16 +1554,20 @@ zap_count_write(objset_t *os, uint64_t z * 4 new blocks written : 2 new split leaf, 2 grown * ptrtbl blocks */ - if (dmu_buf_freeable(zap->zap_dbuf)) - *tooverwrite += SPA_MAXBLOCKSIZE; - else - *towrite += SPA_MAXBLOCKSIZE; + if (dmu_buf_freeable(zap->zap_dbuf)) { + (void) refcount_add_many(tooverwrite, + MZAP_MAX_BLKSZ, FTAG); + } else { + (void) refcount_add_many(towrite, + MZAP_MAX_BLKSZ, FTAG); + } if (add) { - *towrite += 4 * SPA_MAXBLOCKSIZE; + (void) refcount_add_many(towrite, + 4 * MZAP_MAX_BLKSZ, FTAG); } } - zap_unlockdir(zap); + zap_unlockdir(zap, FTAG); return (err); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfeature.c 22 Nov 2015 17:22:31 -0000 @@ -0,0 +1,509 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include "zfeature_common.h" +#include + +/* + * ZFS Feature Flags + * ----------------- + * + * ZFS feature flags are used to provide fine-grained versioning to the ZFS + * on-disk format. Once enabled on a pool feature flags replace the old + * spa_version() number. + * + * Each new on-disk format change will be given a uniquely identifying string + * guid rather than a version number. This avoids the problem of different + * organizations creating new on-disk formats with the same version number. To + * keep feature guids unique they should consist of the reverse dns name of the + * organization which implemented the feature and a short name for the feature, + * separated by a colon (e.g. com.delphix:async_destroy). + * + * Reference Counts + * ---------------- + * + * Within each pool features can be in one of three states: disabled, enabled, + * or active. These states are differentiated by a reference count stored on + * disk for each feature: + * + * 1) If there is no reference count stored on disk the feature is disabled. + * 2) If the reference count is 0 a system administrator has enabled the + * feature, but the feature has not been used yet, so no on-disk + * format changes have been made. + * 3) If the reference count is greater than 0 the feature is active. + * The format changes required by the feature are currently on disk. + * Note that if the feature's format changes are reversed the feature + * may choose to set its reference count back to 0. 
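The three reference-count states described above reduce to a small decision rule. The sketch below is illustrative only; example_feature_state_t and example_feature_state() are hypothetical names, and the real checks live in feature_get_refcount_from_disk(), spa_feature_is_enabled() and spa_feature_is_active() later in this file.

#include <errno.h>
#include <stdint.h>

/*
 * Hypothetical helper: classify a feature from the result of a
 * reference-count lookup (lookup_err is the zap_lookup()-style error;
 * refcount is meaningful only when lookup_err == 0).
 */
typedef enum {
	EXAMPLE_FEATURE_DISABLED,	/* no reference count stored on disk */
	EXAMPLE_FEATURE_ENABLED,	/* reference count stored and == 0 */
	EXAMPLE_FEATURE_ACTIVE		/* reference count stored and > 0 */
} example_feature_state_t;

static example_feature_state_t
example_feature_state(int lookup_err, uint64_t refcount)
{
	if (lookup_err == ENOENT)
		return (EXAMPLE_FEATURE_DISABLED);
	return (refcount == 0 ?
	    EXAMPLE_FEATURE_ENABLED : EXAMPLE_FEATURE_ACTIVE);
}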
+ * + * Feature flags makes no differentiation between non-zero reference counts + * for an active feature (e.g. a reference count of 1 means the same thing as a + * reference count of 27834721), but feature implementations may choose to use + * the reference count to store meaningful information. For example, a new RAID + * implementation might set the reference count to the number of vdevs using + * it. If all those disks are removed from the pool the feature goes back to + * having a reference count of 0. + * + * It is the responsibility of the individual features to maintain a non-zero + * reference count as long as the feature's format changes are present on disk. + * + * Dependencies + * ------------ + * + * Each feature may depend on other features. The only effect of this + * relationship is that when a feature is enabled all of its dependencies are + * automatically enabled as well. Any future work to support disabling of + * features would need to ensure that features cannot be disabled if other + * enabled features depend on them. + * + * On-disk Format + * -------------- + * + * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES + * (5000). In order for this to work the pool is automatically upgraded to + * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk + * format changes will be in use. + * + * Information about features is stored in 3 ZAP objects in the pool's MOS. + * These objects are linked to by the following names in the pool directory + * object: + * + * 1) features_for_read: feature guid -> reference count + * Features needed to open the pool for reading. + * 2) features_for_write: feature guid -> reference count + * Features needed to open the pool for writing. + * 3) feature_descriptions: feature guid -> descriptive string + * A human readable string. + * + * All enabled features appear in either features_for_read or + * features_for_write, but not both. + * + * To open a pool in read-only mode only the features listed in + * features_for_read need to be supported. + * + * To open the pool in read-write mode features in both features_for_read and + * features_for_write need to be supported. + * + * Some features may be required to read the ZAP objects containing feature + * information. To allow software to check for compatibility with these features + * before the pool is opened their names must be stored in the label in a + * new "features_for_read" entry (note that features that are only required + * to write to a pool never need to be stored in the label since the + * features_for_write ZAP object can be read before the pool is written to). + * To save space in the label features must be explicitly marked as needing to + * be written to the label. Also, reference counts are not stored in the label, + * instead any feature whose reference count drops to 0 is removed from the + * label. + * + * Adding New Features + * ------------------- + * + * Features must be registered in zpool_feature_init() function in + * zfeature_common.c using the zfeature_register() function. This function + * has arguments to specify if the feature should be stored in the + * features_for_read or features_for_write ZAP object and if it needs to be + * written to the label when active. + * + * Once a feature is registered it will appear as a "feature@" + * property which can be set by an administrator. 
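On disk, each of these ZAP objects is an ordinary name -> uint64_t mapping, so a feature's reference count can be fetched with a plain zap_lookup() keyed by the feature guid. The sketch below only illustrates that access pattern, assuming the caller already holds the MOS objset and the features_for_read object number; the in-tree version with proper error handling is feature_get_refcount_from_disk() further down in this file.

/*
 * Illustrative sketch: read the reference count of com.delphix:async_destroy
 * from the features_for_read ZAP object.  ENOENT from zap_lookup() means the
 * feature is disabled (no reference count stored on disk).
 */
static int
example_read_feature_refcount(objset_t *os, uint64_t features_for_read_obj,
    uint64_t *refcountp)
{
	return (zap_lookup(os, features_for_read_obj,
	    "com.delphix:async_destroy", sizeof (uint64_t), 1, refcountp));
}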
Feature implementors should + * use the spa_feature_is_enabled() and spa_feature_is_active() functions to + * query the state of a feature and the spa_feature_incr() and + * spa_feature_decr() functions to change an enabled feature's reference count. + * Reference counts may only be updated in the syncing context. + * + * Features may not perform enable-time initialization. Instead, any such + * initialization should occur when the feature is first used. This design + * enforces that on-disk changes be made only when features are used. Code + * should only check if a feature is enabled using spa_feature_is_enabled(), + * not by relying on any feature specific metadata existing. If a feature is + * enabled, but the feature's metadata is not on disk yet then it should be + * created as needed. + * + * As an example, consider the com.delphix:async_destroy feature. This feature + * relies on the existence of a bptree in the MOS that store blocks for + * asynchronous freeing. This bptree is not created when async_destroy is + * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is + * called to check if async_destroy is enabled. If it is and the bptree object + * does not exist yet, the bptree object is created as part of the dataset + * destroy and async_destroy's reference count is incremented to indicate it + * has made an on-disk format change. Later, after the destroyed dataset's + * blocks have all been asynchronously freed there is no longer any use for the + * bptree object, so it is destroyed and async_destroy's reference count is + * decremented back to 0 to indicate that it has undone its on-disk format + * changes. + */ + +typedef enum { + FEATURE_ACTION_INCR, + FEATURE_ACTION_DECR, +} feature_action_t; + +/* + * Checks that the active features in the pool are supported by + * this software. Adds each unsupported feature (name -> description) to + * the supplied nvlist. + */ +boolean_t +spa_features_check(spa_t *spa, boolean_t for_write, + nvlist_t *unsup_feat, nvlist_t *enabled_feat) +{ + objset_t *os = spa->spa_meta_objset; + boolean_t supported; + zap_cursor_t zc; + zap_attribute_t za; + uint64_t obj = for_write ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + supported = B_TRUE; + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + + if (NULL != enabled_feat) { + fnvlist_add_uint64(enabled_feat, za.za_name, + za.za_first_integer); + } + + if (za.za_first_integer != 0 && + !zfeature_is_supported(za.za_name)) { + supported = B_FALSE; + + if (NULL != unsup_feat) { + char *desc = ""; + char buf[MAXPATHLEN]; + + if (zap_lookup(os, spa->spa_feat_desc_obj, + za.za_name, 1, sizeof (buf), buf) == 0) + desc = buf; + + VERIFY(nvlist_add_string(unsup_feat, za.za_name, + desc) == 0); + } + } + } + zap_cursor_fini(&zc); + + return (supported); +} + +/* + * Use an in-memory cache of feature refcounts for quick retrieval. + * + * Note: well-designed features will not need to use this; they should + * use spa_feature_is_enabled() and spa_feature_is_active() instead. + * However, this is non-static for zdb and zhack. 
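The async_destroy example reduces to a reusable first-use pattern: test enablement with spa_feature_is_enabled(), create the feature's metadata lazily the first time it is needed, and record the on-disk change by bumping the reference count in syncing context. A minimal sketch follows, assuming a hypothetical SPA_FEATURE_MY_FEATURE id and a placeholder my_feature_create_metadata(); the spa_feature_*() calls are the ones defined later in this file.

/*
 * Sketch of the first-use pattern.  SPA_FEATURE_MY_FEATURE and
 * my_feature_create_metadata() are hypothetical; spa_feature_is_enabled(),
 * spa_feature_is_active() and spa_feature_incr() are defined below.
 */
static void
my_feature_first_use(spa_t *spa, dmu_tx_t *tx)
{
	if (!spa_feature_is_enabled(spa, SPA_FEATURE_MY_FEATURE))
		return;		/* feature disabled: make no on-disk changes */

	if (!spa_feature_is_active(spa, SPA_FEATURE_MY_FEATURE)) {
		/* Create the feature's metadata lazily, on first use. */
		my_feature_create_metadata(spa, tx);
		/* Record the on-disk format change (syncing context only). */
		spa_feature_incr(spa, SPA_FEATURE_MY_FEATURE, tx);
	}
}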
+ */ +int +feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ + ASSERT(VALID_FEATURE_FID(feature->fi_feature)); + if (spa->spa_feat_refcount_cache[feature->fi_feature] == + SPA_FEATURE_DISABLED) { + return (SET_ERROR(ENOTSUP)); + } + *res = spa->spa_feat_refcount_cache[feature->fi_feature]; + return (0); +} + +/* + * Note: well-designed features will not need to use this; they should + * use spa_feature_is_enabled() and spa_feature_is_active() instead. + * However, this is non-static for zdb and zhack. + */ +int +feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, + uint64_t *res) +{ + int err; + uint64_t refcount; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + /* + * If the pool is currently being created, the feature objects may not + * have been allocated yet. Act as though all features are disabled. + */ + if (zapobj == 0) + return (SET_ERROR(ENOTSUP)); + + err = zap_lookup(spa->spa_meta_objset, zapobj, + feature->fi_guid, sizeof (uint64_t), 1, &refcount); + if (err != 0) { + if (err == ENOENT) + return (SET_ERROR(ENOTSUP)); + else + return (err); + } + *res = refcount; + return (0); +} + + +static int +feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ + uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj; + + ASSERT(zfeature_depends_on(feature->fi_feature, + SPA_FEATURE_ENABLED_TXG)); + + if (!spa_feature_is_enabled(spa, feature->fi_feature)) { + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(enabled_txg_obj != 0); + + VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, + feature->fi_guid, sizeof (uint64_t), 1, res)); + + return (0); +} + +/* + * This function is non-static for zhack; it should otherwise not be used + * outside this file. + */ +void +feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, + dmu_tx_t *tx) +{ + ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount, tx)); + + /* + * feature_sync is called directly from zhack, allowing the + * creation of arbitrary features whose fi_feature field may + * be greater than SPA_FEATURES. When called from zhack, the + * zfeature_info_t object's fi_feature field will be set to + * SPA_FEATURE_NONE. + */ + if (feature->fi_feature != SPA_FEATURE_NONE) { + uint64_t *refcount_cache = + &spa->spa_feat_refcount_cache[feature->fi_feature]; +#ifdef atomic_swap_64 + VERIFY3U(*refcount_cache, ==, + atomic_swap_64(refcount_cache, refcount)); +#else + *refcount_cache = refcount; +#endif + } + + if (refcount == 0) + spa_deactivate_mos_feature(spa, feature->fi_guid); + else if (feature->fi_flags & ZFEATURE_FLAG_MOS) + spa_activate_mos_feature(spa, feature->fi_guid, tx); +} + +/* + * This function is non-static for zhack; it should otherwise not be used + * outside this file. + */ +void +feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + uint64_t initial_refcount = + (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + + /* + * If the feature is already enabled, ignore the request. + */ + if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) + return; + + for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) + spa_feature_enable(spa, feature->fi_depends[i], tx); + + VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, + feature->fi_guid, 1, strlen(feature->fi_desc) + 1, + feature->fi_desc, tx)); + + feature_sync(spa, feature, initial_refcount, tx); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { + uint64_t enabling_txg = dmu_tx_get_txg(tx); + + if (spa->spa_feat_enabled_txg_obj == 0ULL) { + spa->spa_feat_enabled_txg_obj = + zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_ENABLED_TXG, tx); + } + spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); + + VERIFY0(zap_add(spa->spa_meta_objset, + spa->spa_feat_enabled_txg_obj, feature->fi_guid, + sizeof (uint64_t), 1, &enabling_txg, tx)); + } +} + +static void +feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, + dmu_tx_t *tx) +{ + uint64_t refcount; + zfeature_info_t *feature = &spa_feature_table[fid]; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + ASSERT(VALID_FEATURE_FID(fid)); + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + + VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); + + switch (action) { + case FEATURE_ACTION_INCR: + VERIFY3U(refcount, !=, UINT64_MAX); + refcount++; + break; + case FEATURE_ACTION_DECR: + VERIFY3U(refcount, !=, 0); + refcount--; + break; + default: + ASSERT(0); + break; + } + + feature_sync(spa, feature, refcount, tx); +} + +void +spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We create feature flags ZAP objects in two instances: during pool + * creation and during pool upgrade. + */ + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && + tx->tx_txg == TXG_INITIAL)); + + spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_READ, tx); + spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_WRITE, tx); + spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_DESCRIPTIONS, tx); +} + +/* + * Enable any required dependencies, then enable the requested feature. 
+ */ +void +spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + ASSERT(VALID_FEATURE_FID(fid)); + feature_enable_sync(spa, &spa_feature_table[fid], tx); +} + +void +spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx); +} + +void +spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx); +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) +{ + int err; + uint64_t refcount; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0); +} + +boolean_t +spa_feature_is_active(spa_t *spa, spa_feature_t fid) +{ + int err; + uint64_t refcount; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0 && refcount > 0); +} + +/* + * For the feature specified by fid (which must depend on + * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the + * OUT txg argument. + * + * Returns B_TRUE if the feature is enabled, in which case txg will be filled + * with the transaction group in which the specified feature was enabled. + * Returns B_FALSE otherwise (i.e. if the feature is not enabled). + */ +boolean_t +spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) +{ + int err; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); + ASSERT(err == 0 || err == ENOTSUP); + + return (err == 0); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs.conf 12 Jun 2012 05:57:28 -0000 @@ -0,0 +1,28 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# +name="zfs" parent="pseudo"; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c,v retrieving revision 1.3 diff -u -p -r1.3 zfs_acl.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c 27 Feb 2010 23:43:53 -0000 1.3 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c 4 May 2017 20:15:44 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #include @@ -31,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -50,8 +49,8 @@ #include #include #include -#include -#include +#include +#include #define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE #define DENY ACE_ACCESS_DENIED_ACE_TYPE @@ -321,6 +320,117 @@ static acl_ops_t zfs_acl_fuid_ops = { zfs_ace_fuid_data }; +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. 
+ */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + static int zfs_acl_version(int version) { @@ -336,7 +446,7 @@ zfs_acl_version_zp(znode_t *zp) return (zfs_acl_version(zp->z_zfsvfs->z_version)); } -static zfs_acl_t * +zfs_acl_t * zfs_acl_alloc(int vers) { zfs_acl_t *aclp; @@ -352,7 +462,7 @@ zfs_acl_alloc(int vers) return (aclp); } -static zfs_acl_node_t * +zfs_acl_node_t * zfs_acl_node_alloc(size_t bytes) { zfs_acl_node_t *aclnode; @@ -463,6 +573,8 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void * { zfs_acl_node_t *aclnode; + ASSERT(aclp); + if (start == NULL) { aclnode = list_head(&aclp->z_acl); if (aclnode == NULL) @@ -509,6 +621,7 @@ zfs_acl_next_ace(zfs_acl_t *aclp, void * *who = aclp->z_ops.ace_who_get(acep); aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; aclnode->z_ace_idx++; + return ((void *)acep); } return (NULL); @@ -542,7 +655,7 @@ zfs_acl_curr_node(zfs_acl_t *aclp) */ int zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, - void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, zfs_fuid_info_t **fuidp, cred_t *cr) { int i; @@ -569,7 +682,7 @@ zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vt */ if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, aceptr->z_hdr.z_flags) != B_TRUE) - return (EINVAL); + return (SET_ERROR(EINVAL)); switch (acep->a_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: @@ -676,7 +789,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, */ if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, aceptr->z_flags) != B_TRUE) - return (EINVAL); + return (SET_ERROR(EINVAL)); } *size = (caddr_t)aceptr - (caddr_t)z_acl; return (0); @@ -771,10 +884,10 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, /* * Determine mode of file based on ACL. 
- * Also, create FUIDs for any User/Group ACEs */ -static uint64_t -zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) { int entry_type; mode_t mode; @@ -785,7 +898,7 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t uint32_t access_mask; boolean_t an_exec_denied = B_FALSE; - mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { @@ -796,14 +909,13 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t entry_type = (iflags & ACE_TYPE_FLAGS); /* - * Skip over owner@, group@ or everyone@ inherit only ACEs + * Skip over any inherit_only ACEs */ - if ((iflags & ACE_INHERIT_ONLY_ACE) && - (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - entry_type == OWNING_GROUP)) + if (iflags & ACE_INHERIT_ONLY_ACE) continue; - if (entry_type == ACE_OWNER) { + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRUSR))) { seen |= S_IRUSR; @@ -825,7 +937,8 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t mode |= S_IXUSR; } } - } else if (entry_type == OWNING_GROUP) { + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { if ((access_mask & ACE_READ_DATA) && (!(seen & S_IRGRP))) { seen |= S_IRGRP; @@ -930,48 +1043,13 @@ zfs_mode_compute(znode_t *zp, zfs_acl_t an_exec_denied = B_TRUE; if (an_exec_denied) - zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED; + *pflags &= ~ZFS_NO_EXECS_DENIED; else - zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED; + *pflags |= ZFS_NO_EXECS_DENIED; return (mode); } -static zfs_acl_t * -zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) -{ - zfs_acl_t *aclp; - zfs_acl_node_t *aclnode; - - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - - /* - * Version 0 to 1 znode_acl_phys has the size/count fields swapped. - * Version 0 didn't have a size field, only a count. - */ - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_size; - aclp->z_acl_bytes = ZFS_ACL_SIZE(aclp->z_acl_count); - } else { - aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - aclp->z_acl_bytes = zp->z_phys->zp_acl.z_acl_size; - } - - aclnode = zfs_acl_node_alloc(will_modify ? aclp->z_acl_bytes : 0); - aclnode->z_ace_count = aclp->z_acl_count; - if (will_modify) { - bcopy(zp->z_phys->zp_acl.z_ace_data, aclnode->z_acldata, - aclp->z_acl_bytes); - } else { - aclnode->z_size = aclp->z_acl_bytes; - aclnode->z_acldata = &zp->z_phys->zp_acl.z_ace_data[0]; - } - - list_insert_head(&aclp->z_acl, aclnode); - - return (aclp); -} - /* * Read an external acl object. If the intent is to modify, always * create a new acl and leave any cached acl in place. 
@@ -979,60 +1057,100 @@ zfs_acl_node_read_internal(znode_t *zp, static int zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) { - uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; zfs_acl_t *aclp; - size_t aclsize; - size_t acl_count; + int aclsize; + int acl_count; zfs_acl_node_t *aclnode; - int error; + zfs_acl_phys_t znode_acl; + int version; + int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); if (zp->z_acl_cached && !will_modify) { *aclpp = zp->z_acl_cached; return (0); } - if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { - *aclpp = zfs_acl_node_read_internal(zp, will_modify); - if (!will_modify) - zp->z_acl_cached = *aclpp; - return (0); + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; } - aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_version); - if (zp->z_phys->zp_acl.z_acl_version == ZFS_ACL_VERSION_INITIAL) { - zfs_acl_phys_v0_t *zacl0 = - (zfs_acl_phys_v0_t *)&zp->z_phys->zp_acl; + aclp = zfs_acl_alloc(version); - aclsize = ZFS_ACL_SIZE(zacl0->z_acl_count); - acl_count = zacl0->z_acl_count; - } else { - aclsize = zp->z_phys->zp_acl.z_acl_size; - acl_count = zp->z_phys->zp_acl.z_acl_count; - if (aclsize == 0) - aclsize = acl_count * sizeof (zfs_ace_t); - } - aclnode = zfs_acl_node_alloc(aclsize); - list_insert_head(&aclp->z_acl, aclnode); - error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata, DMU_READ_PREFETCH); - aclnode->z_ace_count = acl_count; aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + if (error != 0) { zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); /* convert checksum errors into IO errors */ if (error == ECKSUM) - error = EIO; - return (error); + error = SET_ERROR(EIO); + goto done; } + list_insert_head(&aclp->z_acl, aclnode); + *aclpp = aclp; if (!will_modify) zp->z_acl_cached = aclp; - return (0); +done: + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + return (error); } /* @@ -1045,28 +1163,35 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t int zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { - int error; - znode_phys_t *zphys = zp->z_phys; - zfs_acl_phys_t *zacl = &zphys->zp_acl; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint64_t aoid = 
zphys->zp_acl.z_acl_extern_obj; - uint64_t off = 0; - dmu_object_type_t otype; - zfs_acl_node_t *aclnode; - - dmu_buf_will_dirty(zp->z_dbuf, tx); + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } - zphys->zp_mode = zfs_mode_compute(zp, aclp); - /* - * Decide which object type to use. If we are forced to - * use old ACL format then transform ACL into zfs_oldace_t - * layout. + * Upgrade needed? */ if (!zfsvfs->z_use_fuids) { otype = DMU_OT_OLDACL; @@ -1078,462 +1203,196 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t otype = DMU_OT_ACL; } - if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { - /* - * If ACL was previously external and we are now - * converting to new ACL format then release old - * ACL object and create a new one. - */ - if (aoid && aclp->z_version != zacl->z_acl_version) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - aoid = 0; - } - if (aoid == 0) { - aoid = dmu_object_alloc(zfsvfs->z_os, - otype, aclp->z_acl_bytes, - otype == DMU_OT_ACL ? DMU_OT_SYSACL : DMU_OT_NONE, - otype == DMU_OT_ACL ? DN_MAX_BONUSLEN : 0, tx); - } else { - (void) dmu_object_set_blocksize(zfsvfs->z_os, aoid, - aclp->z_acl_bytes, 0, tx); - } - zphys->zp_acl.z_acl_extern_obj = aoid; - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - dmu_write(zfsvfs->z_os, aoid, off, - aclnode->z_size, aclnode->z_acldata, tx); - off += aclnode->z_size; - } - } else { - void *start = zacl->z_ace_data; - /* - * Migrating back embedded? - */ - if (zphys->zp_acl.z_acl_extern_obj) { - error = dmu_object_free(zfsvfs->z_os, - zp->z_phys->zp_acl.z_acl_extern_obj, tx); - if (error) - return (error); - zphys->zp_acl.z_acl_extern_obj = 0; - } - - for (aclnode = list_head(&aclp->z_acl); aclnode; - aclnode = list_next(&aclp->z_acl, aclnode)) { - if (aclnode->z_ace_count == 0) - continue; - bcopy(aclnode->z_acldata, start, aclnode->z_size); - start = (caddr_t)start + aclnode->z_size; - } - } - - /* - * If Old version then swap count/bytes to match old - * layout of znode_acl_phys_t. - */ - if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { - zphys->zp_acl.z_acl_size = aclp->z_acl_count; - zphys->zp_acl.z_acl_count = aclp->z_acl_bytes; - } else { - zphys->zp_acl.z_acl_size = aclp->z_acl_bytes; - zphys->zp_acl.z_acl_count = aclp->z_acl_count; - } - - zphys->zp_acl.z_acl_version = aclp->z_version; - /* - * Replace ACL wide bits, but first clear them. + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. 
*/ - zp->z_phys->zp_flags &= ~ZFS_ACL_WIDE_FLAGS; - zp->z_phys->zp_flags |= aclp->z_hints; + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + zfs_acl_phys_t acl_phys; + uint64_t aoid; - if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) - zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - - return (0); -} + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); -/* - * Update access mask for prepended ACE - * - * This applies the "groupmask" value for aclmode property. - */ -static void -zfs_acl_prepend_fixup(zfs_acl_t *aclp, void *acep, void *origacep, - mode_t mode, uint64_t owner) -{ - int rmask, wmask, xmask; - int user_ace; - uint16_t aceflags; - uint32_t origmask, acepmask; - uint64_t fuid; - - aceflags = aclp->z_ops.ace_flags_get(acep); - fuid = aclp->z_ops.ace_who_get(acep); - origmask = aclp->z_ops.ace_mask_get(origacep); - acepmask = aclp->z_ops.ace_mask_get(acep); - - user_ace = (!(aceflags & - (ACE_OWNER|ACE_GROUP|ACE_IDENTIFIER_GROUP))); - - if (user_ace && (fuid == owner)) { - rmask = S_IRUSR; - wmask = S_IWUSR; - xmask = S_IXUSR; - } else { - rmask = S_IRGRP; - wmask = S_IWGRP; - xmask = S_IXGRP; - } + aoid = acl_phys.z_acl_extern_obj; - if (origmask & ACE_READ_DATA) { - if (mode & rmask) { - acepmask &= ~ACE_READ_DATA; + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } } else { - acepmask |= ACE_READ_DATA; - } - } + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } - if (origmask & ACE_WRITE_DATA) { - if (mode & wmask) { - acepmask &= ~ACE_WRITE_DATA; - } else { - acepmask |= ACE_WRITE_DATA; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } } - } - - if (origmask & ACE_APPEND_DATA) { - if (mode & wmask) { - acepmask &= ~ACE_APPEND_DATA; + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. 
+ */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; } else { - acepmask |= ACE_APPEND_DATA; + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; } - } + acl_phys.z_acl_version = aclp->z_version; - if (origmask & ACE_EXECUTE) { - if (mode & xmask) { - acepmask &= ~ACE_EXECUTE; - } else { - acepmask |= ACE_EXECUTE; - } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); } - aclp->z_ops.ace_mask_set(acep, acepmask); -} - -/* - * Apply mode to canonical six ACEs. - */ -static void -zfs_acl_fixup_canonical_six(zfs_acl_t *aclp, mode_t mode) -{ - zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); - void *acep; - int maskoff = aclp->z_ops.ace_mask_off(); - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - - ASSERT(aclnode != NULL); - - acep = (void *)((caddr_t)aclnode->z_acldata + - aclnode->z_size - (abstract_size * 6)); /* - * Fixup final ACEs to match the mode + * Replace ACL wide bits, but first clear them. */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; - adjust_ace_pair_common(acep, maskoff, abstract_size, - (mode & 0700) >> 6); /* owner@ */ - - acep = (caddr_t)acep + (abstract_size * 2); - - adjust_ace_pair_common(acep, maskoff, abstract_size, - (mode & 0070) >> 3); /* group@ */ - - acep = (caddr_t)acep + (abstract_size * 2); - adjust_ace_pair_common(acep, maskoff, - abstract_size, mode); /* everyone@ */ -} - - -static int -zfs_acl_ace_match(zfs_acl_t *aclp, void *acep, int allow_deny, - int entry_type, int accessmask) -{ - uint32_t mask = aclp->z_ops.ace_mask_get(acep); - uint16_t type = aclp->z_ops.ace_type_get(acep); - uint16_t flags = aclp->z_ops.ace_flags_get(acep); - - return (mask == accessmask && type == allow_deny && - ((flags & ACE_TYPE_FLAGS) == entry_type)); -} - -/* - * Can prepended ACE be reused? - */ -static int -zfs_reuse_deny(zfs_acl_t *aclp, void *acep, void *prevacep) -{ - int okay_masks; - uint16_t prevtype; - uint16_t prevflags; - uint16_t flags; - uint32_t mask, prevmask; - - if (prevacep == NULL) - return (B_FALSE); - - prevtype = aclp->z_ops.ace_type_get(prevacep); - prevflags = aclp->z_ops.ace_flags_get(prevacep); - flags = aclp->z_ops.ace_flags_get(acep); - mask = aclp->z_ops.ace_mask_get(acep); - prevmask = aclp->z_ops.ace_mask_get(prevacep); - - if (prevtype != DENY) - return (B_FALSE); - - if (prevflags != (flags & ACE_IDENTIFIER_GROUP)) - return (B_FALSE); - - okay_masks = (mask & OKAY_MASK_BITS); - - if (prevmask & ~okay_masks) - return (B_FALSE); - - return (B_TRUE); -} - - -/* - * Insert new ACL node into chain of zfs_acl_node_t's - * - * This will result in two possible results. - * 1. If the ACL is currently just a single zfs_acl_node and - * we are prepending the entry then current acl node will have - * a new node inserted above it. - * - * 2. If we are inserting in the middle of current acl node then - * the current node will be split in two and new node will be inserted - * in between the two split nodes. 
- */ -static zfs_acl_node_t * -zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) -{ - zfs_acl_node_t *newnode; - zfs_acl_node_t *trailernode = NULL; - zfs_acl_node_t *currnode = zfs_acl_curr_node(aclp); - int curr_idx = aclp->z_curr_node->z_ace_idx; - int trailer_count; - size_t oldsize; - - newnode = zfs_acl_node_alloc(aclp->z_ops.ace_size(acep)); - newnode->z_ace_count = 1; - - oldsize = currnode->z_size; - - if (curr_idx != 1) { - trailernode = zfs_acl_node_alloc(0); - trailernode->z_acldata = acep; - - trailer_count = currnode->z_ace_count - curr_idx + 1; - currnode->z_ace_count = curr_idx - 1; - currnode->z_size = (caddr_t)acep - (caddr_t)currnode->z_acldata; - trailernode->z_size = oldsize - currnode->z_size; - trailernode->z_ace_count = trailer_count; - } - - aclp->z_acl_count += 1; - aclp->z_acl_bytes += aclp->z_ops.ace_size(acep); - - if (curr_idx == 1) - list_insert_before(&aclp->z_acl, currnode, newnode); - else - list_insert_after(&aclp->z_acl, currnode, newnode); - if (trailernode) { - list_insert_after(&aclp->z_acl, newnode, trailernode); - aclp->z_curr_node = trailernode; - trailernode->z_ace_idx = 1; - } - - return (newnode); -} - -/* - * Prepend deny ACE - */ -static void * -zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep, - mode_t mode) -{ - zfs_acl_node_t *aclnode; - void *newacep; - uint64_t fuid; - uint16_t flags; - - aclnode = zfs_acl_ace_insert(aclp, acep); - newacep = aclnode->z_acldata; - fuid = aclp->z_ops.ace_who_get(acep); - flags = aclp->z_ops.ace_flags_get(acep); - zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid); - - return (newacep); -} - -/* - * Split an inherited ACE into inherit_only ACE - * and original ACE with inheritance flags stripped off. - */ -static void -zfs_acl_split_ace(zfs_acl_t *aclp, zfs_ace_hdr_t *acep) -{ - zfs_acl_node_t *aclnode; - zfs_acl_node_t *currnode; - void *newacep; - uint16_t type, flags; - uint32_t mask; - uint64_t fuid; - - type = aclp->z_ops.ace_type_get(acep); - flags = aclp->z_ops.ace_flags_get(acep); - mask = aclp->z_ops.ace_mask_get(acep); - fuid = aclp->z_ops.ace_who_get(acep); - - aclnode = zfs_acl_ace_insert(aclp, acep); - newacep = aclnode->z_acldata; - - aclp->z_ops.ace_type_set(newacep, type); - aclp->z_ops.ace_flags_set(newacep, flags | ACE_INHERIT_ONLY_ACE); - aclp->z_ops.ace_mask_set(newacep, mask); - aclp->z_ops.ace_type_set(newacep, type); - aclp->z_ops.ace_who_set(newacep, fuid); - aclp->z_next_ace = acep; - flags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep, flags); - currnode = zfs_acl_curr_node(aclp); - ASSERT(currnode->z_ace_idx >= 1); - currnode->z_ace_idx -= 1; -} + zp->z_pflags |= aclp->z_hints; -/* - * Are ACES started at index i, the canonical six ACES? 
- */ -static int -zfs_have_canonical_six(zfs_acl_t *aclp) -{ - void *acep; - zfs_acl_node_t *aclnode = list_tail(&aclp->z_acl); - int i = 0; - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - - ASSERT(aclnode != NULL); - - if (aclnode->z_ace_count < 6) - return (0); - - acep = (void *)((caddr_t)aclnode->z_acldata + - aclnode->z_size - (aclp->z_ops.ace_abstract_size() * 6)); - - if ((zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - DENY, ACE_OWNER, 0) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - ALLOW, ACE_OWNER, OWNER_ALLOW_MASK) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), DENY, - OWNING_GROUP, 0) && zfs_acl_ace_match(aclp, (caddr_t)acep + - (abstract_size * i++), - ALLOW, OWNING_GROUP, 0) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - DENY, ACE_EVERYONE, EVERYONE_DENY_MASK) && - zfs_acl_ace_match(aclp, (caddr_t)acep + (abstract_size * i++), - ALLOW, ACE_EVERYONE, EVERYONE_ALLOW_MASK))) { - return (1); - } else { - return (0); - } -} - - -/* - * Apply step 1g, to group entries - * - * Need to deal with corner case where group may have - * greater permissions than owner. If so then limit - * group permissions, based on what extra permissions - * group has. - */ -static void -zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, - mode_t mode) -{ - uint32_t prevmask = aclp->z_ops.ace_mask_get(prevacep); - uint32_t mask = aclp->z_ops.ace_mask_get(acep); - uint16_t prevflags = aclp->z_ops.ace_flags_get(prevacep); - mode_t extramode = (mode >> 3) & 07; - mode_t ownermode = (mode >> 6); - - if (prevflags & ACE_IDENTIFIER_GROUP) { - - extramode &= ~ownermode; + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; - if (extramode) { - if (extramode & S_IROTH) { - prevmask &= ~ACE_READ_DATA; - mask &= ~ACE_READ_DATA; - } - if (extramode & S_IWOTH) { - prevmask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - mask &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); - } - if (extramode & S_IXOTH) { - prevmask &= ~ACE_EXECUTE; - mask &= ~ACE_EXECUTE; - } - } - } - aclp->z_ops.ace_mask_set(acep, mask); - aclp->z_ops.ace_mask_set(prevacep, prevmask); + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); } -/* - * Apply the chmod algorithm as described - * in PSARC/2002/240 - */ static void -zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid, - uint64_t mode, zfs_acl_t *aclp) +zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) { - void *acep = NULL, *prevacep = NULL; + void *acep = NULL; uint64_t who; - int i; + int new_count, new_bytes; + int ace_size; int entry_type; - int reuse_deny; - int need_canonical_six = 1; uint16_t iflags, type; uint32_t access_mask; - - /* - * If discard then just discard all ACL nodes which - * represent the ACEs. - * - * New owner@/group@/everone@ ACEs will be added - * later. 
- */ - if (zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) - zfs_acl_release_nodes(aclp); + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops.ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = (vtype == VDIR); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { - entry_type = (iflags & ACE_TYPE_FLAGS); - iflags = (iflags & ALL_INHERIT); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; if ((type != ALLOW && type != DENY) || (iflags & ACE_INHERIT_ONLY_ACE)) { - if (iflags) - aclp->z_hints |= ZFS_INHERIT_ACE; switch (type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: @@ -1542,131 +1401,58 @@ zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t aclp->z_hints |= ZFS_ACL_OBJ_ACE; break; } - goto nextace; - } - - /* - * Need to split ace into two? - */ - if ((iflags & (ACE_FILE_INHERIT_ACE| - ACE_DIRECTORY_INHERIT_ACE)) && - (!(iflags & ACE_INHERIT_ONLY_ACE))) { - zfs_acl_split_ace(aclp, acep); - aclp->z_hints |= ZFS_INHERIT_ACE; - goto nextace; - } - - if (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || - (entry_type == OWNING_GROUP)) { - access_mask &= ~OGE_CLEAR; - aclp->z_ops.ace_mask_set(acep, access_mask); - goto nextace; } else { - reuse_deny = B_TRUE; - if (type == ALLOW) { - - /* - * Check preceding ACE if any, to see - * if we need to prepend a DENY ACE. - * This is only applicable when the acl_mode - * property == groupmask. 
- */ - if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK) { - - reuse_deny = zfs_reuse_deny(aclp, acep, - prevacep); - - if (!reuse_deny) { - prevacep = - zfs_acl_prepend_deny(uid, - aclp, acep, mode); - } else { - zfs_acl_prepend_fixup( - aclp, prevacep, - acep, mode, uid); - } - zfs_fixup_group_entries(aclp, acep, - prevacep, mode); - } - } - } -nextace: - prevacep = acep; - } - - /* - * Check out last six aces, if we have six. - */ - - if (aclp->z_acl_count >= 6) { - if (zfs_have_canonical_six(aclp)) { - need_canonical_six = 0; + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. + * Applies when the "aclmode" property is set to + * "groupmask". + */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; } - } - - if (need_canonical_six) { - size_t abstract_size = aclp->z_ops.ace_abstract_size(); - void *zacep; - zfs_acl_node_t *aclnode = - zfs_acl_node_alloc(abstract_size * 6); - - aclnode->z_size = abstract_size * 6; - aclnode->z_ace_count = 6; - aclp->z_acl_bytes += aclnode->z_size; - list_insert_tail(&aclp->z_acl, aclnode); - - zacep = aclnode->z_acldata; - - i = 0; - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - 0, DENY, -1, ACE_OWNER); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - OWNER_ALLOW_MASK, ALLOW, -1, ACE_OWNER); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, - DENY, -1, OWNING_GROUP); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), 0, - ALLOW, -1, OWNING_GROUP); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - EVERYONE_DENY_MASK, DENY, -1, ACE_EVERYONE); - zfs_set_ace(aclp, (caddr_t)zacep + (abstract_size * i++), - EVERYONE_ALLOW_MASK, ALLOW, -1, ACE_EVERYONE); - aclp->z_acl_count += 6; - } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops.ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); - zfs_acl_fixup_canonical_six(aclp, mode); + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); } int zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) { - int error; + int error = 0; - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); - *aclp = NULL; - error = zfs_acl_node_read(zp, aclp, B_TRUE); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, aclp, B_TRUE); + if (error == 0) { - (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS; - zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp); + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); } mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); - return (error); -} -/* - * strip off write_owner and write_acl - */ -static void -zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) -{ - uint32_t mask = 
aclp->z_ops.ace_mask_get(acep); - - if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && - (aclp->z_ops.ace_type_get(acep) == ALLOW)) { - mask &= ~RESTRICTED_CLEAR; - aclp->z_ops.ace_mask_set(acep, mask); - } + return (error); } /* @@ -1690,11 +1476,11 @@ zfs_ace_can_use(vtype_t vtype, uint16_t */ static zfs_acl_t * zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, - uint64_t mode, boolean_t *need_chmod) + uint64_t mode) { - void *pacep; - void *acep, *acep2; - zfs_acl_node_t *aclnode, *aclnode2; + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; zfs_acl_t *aclp = NULL; uint64_t who; uint32_t access_mask; @@ -1702,22 +1488,14 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_ size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; - boolean_t vdir = vtype == VDIR; - boolean_t vreg = vtype == VREG; - boolean_t passthrough, passthrough_x, noallow; - - passthrough_x = - zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X; - passthrough = passthrough_x || - zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH; - noallow = - zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW; + uint_t aclinherit; + boolean_t isdir = (vtype == VDIR); - *need_chmod = B_TRUE; - pacep = NULL; aclp = zfs_acl_alloc(paclp->z_version); - if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD) + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) return (aclp); + while (pacep = zfs_acl_next_ace(paclp, pacep, &who, &access_mask, &iflags, &type)) { @@ -1727,31 +1505,31 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_ if (!zfs_acl_valid_ace_type(type, iflags)) continue; - if (noallow && type == ALLOW) + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(vtype, iflags)) continue; - ace_size = aclp->z_ops.ace_size(pacep); - - if (!zfs_ace_can_use(vtype, iflags)) - continue; + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } /* - * If owner@, group@, or everyone@ inheritable - * then zfs_acl_chmod() isn't needed. 
+ * Strip write_acl and write_owner from permissions + * when inheriting an ACE */ - if (passthrough && - ((iflags & (ACE_OWNER|ACE_EVERYONE)) || - ((iflags & OWNING_GROUP) == - OWNING_GROUP)) && (vreg || (vdir && (iflags & - ACE_DIRECTORY_INHERIT_ACE)))) { - *need_chmod = B_FALSE; - - if (!vdir && passthrough_x && - ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { - access_mask &= ~ACE_EXECUTE; - } + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; } + ace_size = aclp->z_ops.ace_size(pacep); aclnode = zfs_acl_node_alloc(ace_size); list_insert_tail(&aclp->z_acl, aclnode); acep = aclnode->z_acldata; @@ -1767,66 +1545,51 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_ &data2)) == data1sz); bcopy(data1, data2, data2sz); } + aclp->z_acl_count++; aclnode->z_ace_count++; aclp->z_acl_bytes += aclnode->z_size; newflags = aclp->z_ops.ace_flags_get(acep); - if (vdir) - aclp->z_hints |= ZFS_INHERIT_ACE; - - if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) { + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { newflags &= ~ALL_INHERIT; aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); - zfs_restricted_update(zfsvfs, aclp, acep); continue; } - ASSERT(vdir); + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; - newflags = aclp->z_ops.ace_flags_get(acep); + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ if ((iflags & (ACE_FILE_INHERIT_ACE | - ACE_DIRECTORY_INHERIT_ACE)) != - ACE_FILE_INHERIT_ACE) { - aclnode2 = zfs_acl_node_alloc(ace_size); - list_insert_tail(&aclp->z_acl, aclnode2); - acep2 = aclnode2->z_acldata; - zfs_set_ace(aclp, acep2, - access_mask, type, who, - iflags|ACE_INHERITED_ACE); + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { newflags |= ACE_INHERIT_ONLY_ACE; - aclp->z_ops.ace_flags_set(acep, newflags); - newflags &= ~ALL_INHERIT; - aclp->z_ops.ace_flags_set(acep2, + aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); - - /* - * Copy special opaque data if any - */ - if ((data1sz = aclp->z_ops.ace_data(acep, - &data1)) != 0) { - VERIFY((data2sz = - aclp->z_ops.ace_data(acep2, - &data2)) == data1sz); - bcopy(data1, data2, data1sz); - } - aclp->z_acl_count++; - aclnode2->z_ace_count++; - aclp->z_acl_bytes += aclnode->z_size; - zfs_restricted_update(zfsvfs, aclp, acep2); } else { - newflags |= ACE_INHERIT_ONLY_ACE; + newflags &= ~ACE_INHERIT_ONLY_ACE; aclp->z_ops.ace_flags_set(acep, newflags|ACE_INHERITED_ACE); } } + return (aclp); } /* * Create file system object initial permissions * including inheritable ACEs. + * Also, create FUIDs for owner and group. */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, @@ -1836,8 +1599,12 @@ zfs_acl_ids_create(znode_t *dzp, int fla zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_acl_t *paclp; gid_t gid; - boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; +#ifndef __NetBSD__ + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); +#endif bzero(acl_ids, sizeof (zfs_acl_ids_t)); acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); @@ -1845,7 +1612,6 @@ zfs_acl_ids_create(znode_t *dzp, int fla if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) return (error); - /* * Determine uid and gid. 
*/ @@ -1867,17 +1633,17 @@ zfs_acl_ids_create(znode_t *dzp, int fla (uint64_t)vap->va_gid, cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (acl_ids->z_fgid != dzp->z_phys->zp_gid && + if (acl_ids->z_fgid != dzp->z_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) acl_ids->z_fgid = 0; } if (acl_ids->z_fgid == 0) { - if (dzp->z_phys->zp_mode & S_ISGID) { + if (dzp->z_mode & S_ISGID) { char *domain; uint32_t rid; - acl_ids->z_fgid = dzp->z_phys->zp_gid; + acl_ids->z_fgid = dzp->z_gid; gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); @@ -1895,7 +1661,11 @@ zfs_acl_ids_create(znode_t *dzp, int fla } else { acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, ZFS_GROUP, cr, &acl_ids->z_fuidp); +#ifdef __FreeBSD_kernel__ + gid = acl_ids->z_fgid = dzp->z_gid; +#else gid = crgetgid(cr); +#endif } } } @@ -1907,36 +1677,48 @@ zfs_acl_ids_create(znode_t *dzp, int fla * file's new group, clear the file's set-GID bit. */ - if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) && + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && (vap->va_type == VDIR)) { acl_ids->z_mode |= S_ISGID; } else { if ((acl_ids->z_mode & S_ISGID) && - secpolicy_vnode_setids_setgids(cr, gid) != 0) + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) acl_ids->z_mode &= ~S_ISGID; } if (acl_ids->z_aclp == NULL) { - mutex_enter(&dzp->z_lock); - if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && - (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) && - !(dzp->z_phys->zp_flags & ZFS_XATTR)) { - mutex_enter(&dzp->z_acl_lock); + mutex_enter(&dzp->z_acl_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); - mutex_exit(&dzp->z_acl_lock); acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, - vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + vap->va_type, paclp, acl_ids->z_mode); + inherited = B_TRUE; } else { acl_ids->z_aclp = zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } - mutex_exit(&dzp->z_lock); - if (need_chmod) { - acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ? 
- ZFS_ACL_AUTO_INHERIT : 0; - zfs_acl_chmod(zfsvfs, acl_ids->z_fuid, - acl_ids->z_mode, acl_ids->z_aclp); - } + mutex_exit(&dzp->z_acl_lock); + + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, trim, + acl_ids->z_aclp); + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; } return (0); @@ -1959,12 +1741,12 @@ zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) boolean_t zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) { - return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || - zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); + return (zfs_fuid_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_fuid_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* - * Retrieve a files ACL + * Retrieve a file's ACL */ int zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) @@ -1978,14 +1760,15 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + if (mask == 0) + return (SET_ERROR(ENOSYS)); + if (error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)) return (error); - if (mask == 0) - return (ENOSYS); - mutex_enter(&zp->z_acl_lock); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); @@ -1995,8 +1778,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec /* * Scan ACL to determine number of ACEs */ - if ((zp->z_phys->zp_flags & ZFS_ACL_OBJ_ACE) && - !(mask & VSA_ACE_ALLTYPES)) { + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { void *zacep = NULL; uint64_t who; uint32_t access_mask; @@ -2017,7 +1799,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec } vsecp->vsa_aclcnt = count; } else - count = aclp->z_acl_count; + count = (int)aclp->z_acl_count; if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = count; @@ -2051,11 +1833,11 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsec } if (mask & VSA_ACE_ACLFLAGS) { vsecp->vsa_aclflags = 0; - if (zp->z_phys->zp_flags & ZFS_ACL_DEFAULTED) + if (zp->z_pflags & ZFS_ACL_DEFAULTED) vsecp->vsa_aclflags |= ACL_DEFAULTED; - if (zp->z_phys->zp_flags & ZFS_ACL_PROTECTED) + if (zp->z_pflags & ZFS_ACL_PROTECTED) vsecp->vsa_aclflags |= ACL_PROTECTED; - if (zp->z_phys->zp_flags & ZFS_ACL_AUTO_INHERIT) + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; } @@ -2074,7 +1856,7 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_ int error; if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); @@ -2120,7 +1902,7 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_ } /* - * Set a files ACL + * Set a file's ACL */ int zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) @@ -2133,12 +1915,14 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsec zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; boolean_t fuid_dirtied; + uint64_t acl_obj; + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (mask == 0) - return (ENOSYS); + return (SET_ERROR(ENOSYS)); - if 
(zp->z_phys->zp_flags & ZFS_IMMUTABLE) - return (EPERM); + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) return (error); @@ -2153,41 +1937,43 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsec * existing flags. */ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { - aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); } top: - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); - if (zp->z_phys->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL? */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - zp->z_phys->zp_acl.z_acl_version == - ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, DMU_OBJECT_END); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, aclp->z_acl_bytes); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); } else { - dmu_tx_hold_write(tx, - zp->z_phys->zp_acl.z_acl_extern_obj, - 0, aclp->z_acl_bytes); + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); if (error == ERESTART) { dmu_tx_wait(tx); @@ -2201,20 +1987,18 @@ top: error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); zp->z_acl_cached = aclp; if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); dmu_tx_commit(tx); -done: mutex_exit(&zp->z_acl_lock); - mutex_exit(&zp->z_lock); return (error); } @@ -2231,7 +2015,7 @@ zfs_zaccess_dataset_check(znode_t *zp, u (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && (!IS_DEVVP(ZTOV(zp)) || (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { - return (EROFS); + return (SET_ERROR(EROFS)); } /* @@ -2239,20 +2023,32 @@ zfs_zaccess_dataset_check(znode_t *zp, u */ if ((v4_mode & WRITE_MASK_DATA) && (((ZTOV(zp)->v_type != VDIR) && - (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || (ZTOV(zp)->v_type == VDIR && - (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { - return (EPERM); + (zp->z_pflags & ZFS_IMMUTABLE)))) { + return (SET_ERROR(EPERM)); } +#ifdef illumos if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && - (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { + (zp->z_pflags & ZFS_NOUNLINK)) { + return (SET_ERROR(EPERM)); + } +#else + /* + * In FreeBSD we allow to modify directory's content is ZFS_NOUNLINK + * (sunlnk) is set. We just don't allow directory removal, which is + * handled in zfs_zaccess_delete(). 
+ */ + if ((v4_mode & ACE_DELETE) && + (zp->z_pflags & ZFS_NOUNLINK)) { return (EPERM); } +#endif if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && - (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { - return (EACCES); + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); } return (0); @@ -2298,19 +2094,22 @@ zfs_zaccess_aces_check(znode_t *zp, uint uint32_t deny_mask = 0; zfs_ace_hdr_t *acep = NULL; boolean_t checkit; - uid_t fowner; uid_t gowner; + uid_t fowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); mutex_enter(&zp->z_acl_lock); + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); error = zfs_acl_node_read(zp, &aclp, B_FALSE); if (error != 0) { mutex_exit(&zp->z_acl_lock); return (error); } + ASSERT(zp->z_acl_cached); + while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { uint32_t mask_matched; @@ -2358,7 +2157,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint break; } else { mutex_exit(&zp->z_acl_lock); - return (EIO); + return (SET_ERROR(EIO)); } } @@ -2392,7 +2191,7 @@ zfs_zaccess_aces_check(znode_t *zp, uint /* Put the found 'denies' back on the working mode */ if (deny_mask) { *working_mode |= deny_mask; - return (EACCES); + return (SET_ERROR(EACCES)); } else if (*working_mode) { return (-1); } @@ -2410,18 +2209,10 @@ zfs_has_access(znode_t *zp, cred_t *cr) uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { - uid_t owner; - - owner = zfs_fuid_map_id(zp->z_zfsvfs, - zp->z_phys->zp_uid, cr, ZFS_OWNER); + uid_t owner; - return ( - secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || - secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || - secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || - secpolicy_vnode_chown(cr, owner) == 0 || - secpolicy_vnode_setdac(cr, owner) == 0 || - secpolicy_vnode_remove(cr) == 0); + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); } return (B_TRUE); } @@ -2467,7 +2258,7 @@ zfs_zaccess_append(znode_t *zp, uint32_t cred_t *cr) { if (*working_mode != ACE_WRITE_DATA) - return (EACCES); + return (SET_ERROR(EACCES)); return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, check_privs, B_FALSE, cr)); @@ -2479,38 +2270,33 @@ zfs_fastaccesschk_execute(znode_t *zdp, boolean_t owner = B_FALSE; boolean_t groupmbr = B_FALSE; boolean_t is_attr; - uid_t fowner; - uid_t gowner; uid_t uid = crgetuid(cr); int error; - if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED) - return (EACCES); + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); - is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) && + is_attr = ((zdp->z_pflags & ZFS_XATTR) && (ZTOV(zdp)->v_type == VDIR)); if (is_attr) goto slow; + mutex_enter(&zdp->z_acl_lock); - if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) { + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { mutex_exit(&zdp->z_acl_lock); return (0); } - if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 || - FUID_INDEX(zdp->z_phys->zp_gid) != 0) { + if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) { mutex_exit(&zdp->z_acl_lock); goto slow; } - fowner = (uid_t)zdp->z_phys->zp_uid; - gowner = (uid_t)zdp->z_phys->zp_gid; - - if (uid == fowner) { + if (uid == zdp->z_uid) { owner = B_TRUE; - if (zdp->z_phys->zp_mode & S_IXUSR) { + if (zdp->z_mode & S_IXUSR) { mutex_exit(&zdp->z_acl_lock); return (0); } else { @@ -2518,9 +2304,9 @@ zfs_fastaccesschk_execute(znode_t *zdp, goto slow; } } - if (groupmember(gowner, cr)) { + if (groupmember(zdp->z_gid, cr)) { groupmbr = B_TRUE; - if 
(zdp->z_phys->zp_mode & S_IXGRP) { + if (zdp->z_mode & S_IXGRP) { mutex_exit(&zdp->z_acl_lock); return (0); } else { @@ -2529,7 +2315,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, } } if (!owner && !groupmbr) { - if (zdp->z_phys->zp_mode & S_IXOTH) { + if (zdp->z_mode & S_IXOTH) { mutex_exit(&zdp->z_acl_lock); return (0); } @@ -2546,8 +2332,10 @@ slow: } /* - * Determine whether Access should be granted/denied, invoking least - * priv subsytem when a deny is determined. + * Determine whether Access should be granted/denied. + * + * The least priv subsytem is always consulted as a basic privilege + * can define any form of access. */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) @@ -2555,20 +2343,36 @@ zfs_zaccess(znode_t *zp, int mode, int f uint32_t working_mode; int error; int is_attr; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; boolean_t check_privs; znode_t *xzp; znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; - is_attr = ((zp->z_phys->zp_flags & ZFS_XATTR) && - (ZTOV(zp)->v_type == VDIR)); + is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); +#ifndef illumos + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES. + */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else /* * If attribute then validate against base file */ if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + if ((error = zfs_zget(zp->z_zfsvfs, - zp->z_phys->zp_parent, &xzp)) != 0) { + parent, &xzp)) != 0) { return (error); } @@ -2588,12 +2392,38 @@ zfs_zaccess(znode_t *zp, int mode, int f mode |= ACE_READ_NAMED_ATTRS; } } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. 
+ */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, &check_privs, skipaclchk, cr)) == 0) { if (is_attr) VN_RELE(ZTOV(xzp)); - return (0); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); } if (error && !check_privs) { @@ -2607,12 +2437,8 @@ zfs_zaccess(znode_t *zp, int mode, int f } if (error && check_privs) { - uid_t owner; mode_t checkmode = 0; - owner = zfs_fuid_map_id(zfsvfs, check_zp->z_phys->zp_uid, cr, - ZFS_OWNER); - /* * First check for implicit owner permission on * read_acl/read_attributes @@ -2634,21 +2460,20 @@ zfs_zaccess(znode_t *zp, int mode, int f if (working_mode & ACE_EXECUTE) checkmode |= VEXEC; - if (checkmode) - error = secpolicy_vnode_access(cr, ZTOV(check_zp), - owner, checkmode); + error = secpolicy_vnode_access2(cr, ZTOV(check_zp), owner, + needed_bits & ~checkmode, needed_bits); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, owner); + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) - error = secpolicy_vnode_setdac(cr, owner); + error = secpolicy_vnode_setdac(ZTOV(check_zp), cr, owner); if (error == 0 && (working_mode & (ACE_DELETE|ACE_DELETE_CHILD))) - error = secpolicy_vnode_remove(cr); + error = secpolicy_vnode_remove(ZTOV(check_zp), cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, owner); + error = secpolicy_vnode_chown(ZTOV(check_zp), cr, owner); } if (error == 0) { /* @@ -2656,11 +2481,15 @@ zfs_zaccess(znode_t *zp, int mode, int f * for are still present. If so then return EACCES */ if (working_mode & ~(ZFS_CHECKED_MASKS)) { - error = EACCES; + error = SET_ERROR(EACCES); } } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); } + if (is_attr) VN_RELE(ZTOV(xzp)); @@ -2690,15 +2519,15 @@ zfs_zaccess_unix(znode_t *zp, mode_t mod static int zfs_delete_final_check(znode_t *zp, znode_t *dzp, - mode_t missing_perms, cred_t *cr) + mode_t available_perms, cred_t *cr) { int error; uid_t downer; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - downer = zfs_fuid_map_id(zfsvfs, dzp->z_phys->zp_uid, cr, ZFS_OWNER); + downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); - error = secpolicy_vnode_access(cr, ZTOV(dzp), downer, missing_perms); + error = secpolicy_vnode_access2(cr, ZTOV(dzp), + downer, available_perms, VWRITE|VEXEC); if (error == 0) error = zfs_sticky_remove_access(dzp, zp, cr); @@ -2710,7 +2539,6 @@ zfs_delete_final_check(znode_t *zp, znod * Determine whether Access should be granted/deny, without * consulting least priv subsystem. * - * * The following chart is the recommended NFSv4 enforcement for * ability to delete an object. 
* @@ -2747,7 +2575,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; int dzp_error, zp_error; - mode_t missing_perms; + mode_t available_perms; boolean_t dzpcheck_privs = B_TRUE; boolean_t zpcheck_privs = B_TRUE; @@ -2765,8 +2593,8 @@ zfs_zaccess_delete(znode_t *dzp, znode_t * to determine what was found. */ - if (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) - return (EPERM); + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); /* * First row @@ -2801,30 +2629,27 @@ zfs_zaccess_delete(znode_t *dzp, znode_t */ if (dzp_error == EACCES) - return (secpolicy_vnode_remove(cr)); + return (secpolicy_vnode_remove(ZTOV(dzp), cr)); /* XXXPJD: s/dzp/zp/ ? */ /* * Third Row * only need to see if we have write/execute on directory. */ - if ((dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, - &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) - return (zfs_sticky_remove_access(dzp, zp, cr)); + dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); - if (!dzpcheck_privs) + if (dzp_error != 0 && !dzpcheck_privs) return (dzp_error); /* * Fourth row */ - missing_perms = (dzp_working_mode & ACE_WRITE_DATA) ? VWRITE : 0; - missing_perms |= (dzp_working_mode & ACE_EXECUTE) ? VEXEC : 0; - - ASSERT(missing_perms); + available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; + available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; - return (zfs_delete_final_check(zp, dzp, missing_perms, cr)); + return (zfs_delete_final_check(zp, dzp, available_perms, cr)); } @@ -2835,8 +2660,8 @@ zfs_zaccess_rename(znode_t *sdzp, znode_ int add_perm; int error; - if (szp->z_phys->zp_flags & ZFS_AV_QUARANTINED) - return (EACCES); + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); add_perm = (ZTOV(szp)->v_type == VDIR) ? ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; @@ -2844,7 +2669,15 @@ zfs_zaccess_rename(znode_t *sdzp, znode_ /* * Rename permissions are combination of delete permission + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. */ + if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { + if (error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)) + return (error); + } /* * first make sure we do the delete portion. Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_byteswap.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c 27 Feb 2010 22:31:18 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c 23 Mar 2013 15:29:23 -0000 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -27,6 +27,7 @@ #include #include #include +#include #include void @@ -50,7 +51,7 @@ zfs_ace_byteswap(void *buf, size_t size, { caddr_t end; caddr_t ptr; - zfs_ace_t *zacep; + zfs_ace_t *zacep = NULL; ace_t *acep; uint16_t entry_type; size_t entry_size; Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c,v retrieving revision 1.2 diff -u -p -r1.2 zfs_ctldir.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c 14 Dec 2010 01:28:18 -0000 1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c 11 May 2017 23:15:03 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. */ /* @@ -64,95 +65,248 @@ * so that it cannot be freed until all snapshots have been unmounted. */ -#include +#ifdef __FreeBSD__ + +#include #include #include #include -#include -#include +#include #include #include +#include +#include #include #include -#include +#include #include "zfs_namecheck.h" -typedef struct zfsctl_node { - gfs_dir_t zc_gfs_private; - uint64_t zc_id; - timestruc_t zc_cmtime; /* ctime and mtime, always the same */ -} zfsctl_node_t; - -typedef struct zfsctl_snapdir { - zfsctl_node_t sd_node; - kmutex_t sd_lock; - avl_tree_t sd_snaps; -} zfsctl_snapdir_t; +/* + * "Synthetic" filesystem implementation. + */ -typedef struct { - char *se_name; - vnode_t *se_root; - avl_node_t se_node; -} zfs_snapentry_t; - -static int -snapentry_compare(const void *a, const void *b) -{ - const zfs_snapentry_t *sa = a; - const zfs_snapentry_t *sb = b; - int ret = strcmp(sa->se_name, sb->se_name); - - if (ret < 0) - return (-1); - else if (ret > 0) - return (1); - else - return (0); -} - -//vnodeops_t *zfsctl_ops_shares_dir; - -static struct vnodeopv_entry_desc zfsctl_ops_root; -static struct vnodeopv_entry_desc zfsctl_ops_snapdir; -static struct vnodeopv_entry_desc zfsctl_ops_snapshot; -static struct vnodeopv_entry_desc zfsctl_ops_shares; - - -static vnode_t *zfsctl_mknode_snapdir(vnode_t *); -static vnode_t *zfsctl_mknode_shares(vnode_t *); -static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); -static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); - -static gfs_opsvec_t zfsctl_opsvec[] = { - { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, - { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, - { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, - { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir }, - { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares }, - { NULL } -}; +/* + * Assert that A implies B. + */ +#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); + +static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); + +typedef struct sfs_node { + char sn_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t sn_parent_id; + uint64_t sn_id; +} sfs_node_t; /* - * Root directory elements. We only have two entries - * snapshot and shares. + * Check the parent's ID as well as the node's to account for a chance + * that IDs originating from different domains (snapshot IDs, artifical + * IDs, znode IDs) may clash. 
*/ -static gfs_dirent_t zfsctl_root_entries[] = { - { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, - { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE }, - { NULL } -}; +static int +sfs_compare_ids(struct vnode *vp, void *arg) +{ + sfs_node_t *n1 = vp->v_data; + sfs_node_t *n2 = arg; + bool equal; + + equal = n1->sn_id == n2->sn_id && + n1->sn_parent_id == n2->sn_parent_id; + + /* Zero means equality. */ + return (!equal); +} -/* include . and .. in the calculation */ -#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \ - sizeof (gfs_dirent_t)) + 1) +static int +sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + sfs_node_t search; + int err; + + search.sn_id = id; + search.sn_parent_id = parent_id; + err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp, + sfs_compare_ids, &search); + return (err); +} + +static int +sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + int err; + + KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); + err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp, + sfs_compare_ids, vp->v_data); + return (err); +} + +static void +sfs_vnode_remove(struct vnode *vp) +{ + vfs_hash_remove(vp); +} + +typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); + +static int +sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, + const char *tag, struct vop_vector *vops, + sfs_vnode_setup_fn setup, void *arg, + struct vnode **vpp) +{ + struct vnode *vp; + int error; + + error = sfs_vnode_get(mp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + /* Allocate a new vnode/inode. */ + error = getnewvnode(tag, mp, vops, &vp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + /* + * Exclusively lock the vnode vnode while it's being constructed. + */ + lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); + error = insmntque(vp, mp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + setup(vp, arg); + + error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + *vpp = vp; + return (0); +} + +static void +sfs_print_node(sfs_node_t *node) +{ + printf("\tname = %s\n", node->sn_name); + printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); + printf("\tid = %ju\n", (uintmax_t)node->sn_id); +} + +static sfs_node_t * +sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) +{ + struct sfs_node *node; + + KASSERT(strlen(name) < sizeof(node->sn_name), + ("sfs node name is too long")); + KASSERT(size >= sizeof(*node), ("sfs node size is too small")); + node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); + strlcpy(node->sn_name, name, sizeof(node->sn_name)); + node->sn_parent_id = parent_id; + node->sn_id = id; + + return (node); +} + +static void +sfs_destroy_node(sfs_node_t *node) +{ + free(node, M_SFSNODES); +} + +static void * +sfs_reclaim_vnode(vnode_t *vp) +{ + sfs_node_t *node; + void *data; + + sfs_vnode_remove(vp); + data = vp->v_data; + vp->v_data = NULL; + return (data); +} + +static int +sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, + uio_t *uio, off_t *offp) +{ + struct dirent entry; + int error; + + /* Reset ncookies for subsequent use of vfs_read_dirent. 
*/ + if (ap->a_ncookies != NULL) + *ap->a_ncookies = 0; + + if (uio->uio_resid < sizeof(entry)) + return (SET_ERROR(EINVAL)); + + if (uio->uio_offset < 0) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == 0) { + entry.d_fileno = id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '\0'; + entry.d_namlen = 1; + entry.d_reclen = sizeof(entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (uio->uio_offset < sizeof(entry)) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == sizeof(entry)) { + entry.d_fileno = parent_id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '.'; + entry.d_name[2] = '\0'; + entry.d_namlen = 2; + entry.d_reclen = sizeof(entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (offp != NULL) + *offp = 2 * sizeof(entry); + return (0); +} /* - * Initialize the various GFS pieces we'll need to create and manipulate .zfs - * directories. This is called from the ZFS init routine, and initializes the - * vnode ops vectors that we'll be using. + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/ objectid(snap) */ +#define ZFSCTL_INO_SNAP(id) (id) + +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; +static struct vop_vector zfsctl_ops_shares_dir; + void zfsctl_init(void) { @@ -163,80 +317,120 @@ zfsctl_fini(void) { } -/* - * Return the inode number associated with the 'snapshot' or - * 'shares' directory. - */ -/* ARGSUSED */ -static ino64_t -zfsctl_root_inode_cb(vnode_t *vp, int index) +boolean_t +zfsctl_is_node(vnode_t *vp) { - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + return (vn_matchops(vp, zfsctl_ops_root) || + vn_matchops(vp, zfsctl_ops_snapdir) || + vn_matchops(vp, zfsctl_ops_snapshot) || + vn_matchops(vp, zfsctl_ops_shares_dir)); - ASSERT(index <= 2); +} - if (index == 0) - return (ZFSCTL_INO_SNAPDIR); +typedef struct zfsctl_root { + sfs_node_t node; + sfs_node_t *snapdir; + timestruc_t cmtime; +} zfsctl_root_t; - return (zfsvfs->z_shares_dir); -} /* - * Create the '.zfs' directory. This directory is cached as part of the VFS - * structure. This results in a hold on the vfs_t. The code in zfs_umount() - * therefore checks against a vfs_count of 2 instead of 1. This reference - * is removed when the ctldir is destroyed in the unmount. + * Create the '.zfs' directory. 
*/ void zfsctl_create(zfsvfs_t *zfsvfs) { - vnode_t *vp, *rvp; - zfsctl_node_t *zcp; + zfsctl_root_t *dot_zfs; + sfs_node_t *snapdir; + vnode_t *rvp; + uint64_t crtime[2]; ASSERT(zfsvfs->z_ctldir == NULL); - vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs, - zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries, - zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL); - zcp = vp->v_data; - zcp->zc_id = ZFSCTL_INO_ROOT; - - VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0); - ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime); - VN_RELE(rvp); + snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT, + ZFSCTL_INO_SNAPDIR); + dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0, + ZFSCTL_INO_ROOT); + dot_zfs->snapdir = snapdir; + + VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); + VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof(crtime))); + ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); + vput(rvp); - /* - * We're only faking the fact that we have a root of a filesystem for - * the sake of the GFS interfaces. Undo the flag manipulation it did - * for us. - */ - vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT); - - zfsvfs->z_ctldir = vp; + zfsvfs->z_ctldir = dot_zfs; } /* * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. - * There might still be more references if we were force unmounted, but only - * new zfs_inactive() calls can occur and they don't reference .zfs + * The nodes must not have any associated vnodes by now as they should be + * vflush-ed. */ void zfsctl_destroy(zfsvfs_t *zfsvfs) { - VN_RELE(zfsvfs->z_ctldir); + sfs_destroy_node(zfsvfs->z_ctldir->snapdir); + sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); zfsvfs->z_ctldir = NULL; } +static int +zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + return (VFS_ROOT(mp, flags, vpp)); +} + +static void +zfsctl_common_vnode_setup(vnode_t *vp, void *arg) +{ + ASSERT_VOP_ELOCKED(vp, __func__); + + /* We support shared locking. */ + VN_LOCK_ASHARE(vp); + vp->v_type = VDIR; + vp->v_data = arg; +} + +static int +zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir; + err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, + zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +static int +zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir; + err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", + &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); + return (err); +} + /* * Given a root znode, retrieve the associated .zfs directory. * Add a hold to the vnode and return it. 
*/ -vnode_t * -zfsctl_root(znode_t *zp) +int +zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) { - ASSERT(zfs_has_ctldir(zp)); - VN_HOLD(zp->z_zfsvfs->z_ctldir); - return (zp->z_zfsvfs->z_ctldir); + vnode_t *vp; + int error; + + error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); + return (error); } /* @@ -244,10 +438,12 @@ zfsctl_root(znode_t *zp) */ /* ARGSUSED */ static int -zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct) +zfsctl_common_open(struct vop_open_args *ap) { + int flags = ap->a_mode; + if (flags & FWRITE) - return (EACCES); + return (SET_ERROR(EACCES)); return (0); } @@ -257,8 +453,7 @@ zfsctl_common_open(vnode_t **vpp, int fl */ /* ARGSUSED */ static int -zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off, - cred_t *cr, caller_context_t *ct) +zfsctl_common_close(struct vop_close_args *ap) { return (0); } @@ -268,17 +463,18 @@ zfsctl_common_close(vnode_t *vpp, int fl */ /* ARGSUSED */ static int -zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, - caller_context_t *ct) +zfsctl_common_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + accmode_t a_accmode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; { - if (flags & V_ACE_MASK) { - if (mode & ACE_ALL_WRITE_PERMS) - return (EACCES); - } else { - if (mode & VWRITE) - return (EACCES); - } + accmode_t accmode = ap->a_accmode; + if (accmode & VWRITE) + return (SET_ERROR(EACCES)); return (0); } @@ -289,6 +485,9 @@ static void zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) { timestruc_t now; + sfs_node_t *node; + + node = vp->v_data; vap->va_uid = 0; vap->va_gid = 0; @@ -300,7 +499,7 @@ zfsctl_common_getattr(vnode_t *vp, vattr vap->va_blksize = 0; vap->va_nblocks = 0; vap->va_seq = 0; - vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; vap->va_type = VDIR; @@ -309,417 +508,314 @@ zfsctl_common_getattr(vnode_t *vp, vattr */ gethrestime(&now); vap->va_atime = now; + /* FreeBSD: Reset chflags(2) flags. */ + vap->va_flags = 0; + + vap->va_nodeid = node->sn_id; + + /* At least '.' and '..'. 
*/ + vap->va_nlink = 2; } /*ARGSUSED*/ static int -zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) -{ - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_node_t *zcp = vp->v_data; - uint64_t object = zcp->zc_id; +zfsctl_common_fid(ap) + struct vop_fid_args /* { + struct vnode *a_vp; + struct fid *a_fid; + } */ *ap; +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + sfs_node_t *node = vp->v_data; + uint64_t object = node->sn_id; zfid_short_t *zfid; int i; - ZFS_ENTER(zfsvfs); - - if (fidp->fid_len < SHORT_FID_LEN) { - fidp->fid_len = SHORT_FID_LEN; - ZFS_EXIT(zfsvfs); - return (ENOSPC); - } - zfid = (zfid_short_t *)fidp; - zfid->zf_len = SHORT_FID_LEN; - for (i = 0; i < sizeof (zfid->zf_object); i++) + for (i = 0; i < sizeof(zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); - /* .zfs znodes always have a generation number of 0 */ - for (i = 0; i < sizeof (zfid->zf_gen); i++) + /* .zfs nodes always have a generation number of 0 */ + for (i = 0; i < sizeof(zfid->zf_gen); i++) zfid->zf_gen[i] = 0; - ZFS_EXIT(zfsvfs); return (0); } - -/*ARGSUSED*/ static int -zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +zfsctl_common_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; { - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - znode_t *dzp; - int error; - - ZFS_ENTER(zfsvfs); + vnode_t *vp = ap->a_vp; - if (zfsvfs->z_shares_dir == 0) { - ZFS_EXIT(zfsvfs); - return (ENOTSUP); - } - - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { - error = VOP_FID(ZTOV(dzp), fidp, ct); - VN_RELE(ZTOV(dzp)); - } - - ZFS_EXIT(zfsvfs); - return (error); + (void) sfs_reclaim_vnode(vp); + return (0); } -/* - * .zfs inode namespace - * - * We need to generate unique inode numbers for all files and directories - * within the .zfs pseudo-filesystem. We use the following scheme: - * - * ENTRY ZFSCTL_INODE - * .zfs 1 - * .zfs/snapshot 2 - * .zfs/snapshot/ objectid(snap) - */ -#define ZFSCTL_INO_SNAP(id) (id) +static int +zfsctl_common_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + sfs_print_node(ap->a_vp->v_data); + return (0); +} /* * Get root directory attributes. */ /* ARGSUSED */ static int -zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) -{ - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_node_t *zcp = vp->v_data; - - ZFS_ENTER(zfsvfs); - vap->va_nodeid = ZFSCTL_INO_ROOT; - vap->va_nlink = vap->va_size = NROOT_ENTRIES; - vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; +zfsctl_root_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsctl_root_t *node = vp->v_data; zfsctl_common_getattr(vp, vap); - ZFS_EXIT(zfsvfs); + vap->va_ctime = node->cmtime; + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + vap->va_nlink += 1; /* snapdir */ + vap->va_size = vap->va_nlink; + return (0); +} +/* + * When we lookup "." we still can be asked to lock it + * differently, can't we? + */ +int +zfsctl_relock_dot(vnode_t *dvp, int ltype) +{ + vref(dvp); + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* Relock for the "." case may left us with reclaimed vnode. 
*/ + if ((dvp->v_iflag & VI_DOOMED) != 0) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } return (0); } /* * Special case the handling of "..". */ -/* ARGSUSED */ int -zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; +zfsctl_root_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + cred_t *cr = ap->a_cnp->cn_cred; + int flags = ap->a_cnp->cn_flags; + int lkflags = ap->a_cnp->cn_lkflags; + int nameiop = ap->a_cnp->cn_nameiop; int err; + int ltype; - /* - * No extended attributes allowed under .zfs - */ - if (flags & LOOKUP_XATTR) - return (EINVAL); + ASSERT(dvp->v_type == VDIR); - ZFS_ENTER(zfsvfs); + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); - if (strcmp(nm, "..") == 0) { - err = VFS_ROOT(dvp->v_vfsp, vpp); + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + } else if ((flags & ISDOTDOT) != 0) { + err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, + lkflags, vpp); + } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { + err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); } else { - err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir, - cr, ct, direntflags, realpnp); + err = SET_ERROR(ENOENT); } - - ZFS_EXIT(zfsvfs); - + if (err != 0) + *vpp = NULL; return (err); } static int -zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, - caller_context_t *ct) -{ - /* - * We only care about ACL_ENABLED so that libsec can - * display ACL correctly and not default to POSIX draft. 
- */ - if (cmd == _PC_ACL_ENABLED) { - *valp = _ACL_ACE_ENABLED; - return (0); - } - - return (fs_pathconf(vp, cmd, valp, cr, ct)); -} - -static const fs_operation_def_t zfsctl_tops_root[] = { - { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, - { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, - { VOPNAME_IOCTL, { .error = fs_inval } }, - { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } }, - { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, - { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, - { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, - { VOPNAME_SEEK, { .vop_seek = fs_seek } }, - { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, - { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, - { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, - { NULL } -}; - -static int -zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) -{ - objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - - if (snapshot_namecheck(name, NULL, NULL) != 0) - return (EILSEQ); - dmu_objset_name(os, zname); - if (strlen(zname) + 1 + strlen(name) >= len) - return (ENAMETOOLONG); - (void) strcat(zname, "@"); - (void) strcat(zname, name); - return (0); -} - -static int -zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) +zfsctl_root_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *ncookies; + u_long **a_cookies; + } */ *ap; { - vnode_t *svp = sep->se_root; + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_root_t *node = vp->v_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; int error; - ASSERT(vn_ismntpt(svp)); - - /* this will be dropped by dounmount() */ - if ((error = vn_vfswlock(svp)) != 0) - return (error); + ASSERT(vp->v_type == VDIR); - VN_HOLD(svp); - error = dounmount(vn_mountedvfs(svp), fflags, cr); - if (error) { - VN_RELE(svp); + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; return (error); } + if (uio->uio_offset != dots_offset) + return (SET_ERROR(EINVAL)); - /* - * We can't use VN_RELE(), as that will try to invoke - * zfsctl_snapdir_inactive(), which would cause us to destroy - * the sd_lock mutex held by our caller. 
- */ - ASSERT(svp->v_count == 1); - gfs_vop_inactive(svp, cr, NULL); - - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - + CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name)); + entry.d_fileno = node->snapdir->sn_id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, node->snapdir->sn_name); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof(entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + return (SET_ERROR(error)); + } + if (eofp != NULL) + *eofp = 1; return (0); } -static void -zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm) +static int +zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) { - avl_index_t where; - vfs_t *vfsp; - refstr_t *pathref; - char newpath[MAXNAMELEN]; - char *tail; - - ASSERT(MUTEX_HELD(&sdp->sd_lock)); - ASSERT(sep != NULL); - - vfsp = vn_mountedvfs(sep->se_root); - ASSERT(vfsp != NULL); + static const char dotzfs_name[4] = ".zfs"; + vnode_t *dvp; + int error; - vfs_lock_wait(vfsp); + if (*ap->a_buflen < sizeof (dotzfs_name)) + return (SET_ERROR(ENOMEM)); - /* - * Change the name in the AVL tree. - */ - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); - (void) strcpy(sep->se_name, nm); - VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL); - avl_insert(&sdp->sd_snaps, sep, where); - - /* - * Change the current mountpoint info: - * - update the tail of the mntpoint path - * - update the tail of the resource path - */ - pathref = vfs_getmntpoint(vfsp); - (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); - VERIFY((tail = strrchr(newpath, '/')) != NULL); - *(tail+1) = '\0'; - ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); - (void) strcat(newpath, nm); - refstr_rele(pathref); - vfs_setmntpoint(vfsp, newpath); - - pathref = vfs_getresource(vfsp); - (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath)); - VERIFY((tail = strrchr(newpath, '@')) != NULL); - *(tail+1) = '\0'; - ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath)); - (void) strcat(newpath, nm); - refstr_rele(pathref); - vfs_setresource(vfsp, newpath); + error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, + LK_SHARED, &dvp); + if (error != 0) + return (SET_ERROR(error)); - vfs_unlock(vfsp); + VOP_UNLOCK(dvp, 0); + *ap->a_vpp = dvp; + *ap->a_buflen -= sizeof (dotzfs_name); + bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); + return (0); } -/*ARGSUSED*/ +static struct vop_vector zfsctl_ops_root = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_root_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_root_readdir, + .vop_lookup = zfsctl_root_lookup, + .vop_inactive = VOP_NULL, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_vptocnp = zfsctl_root_vptocnp, +}; + static int -zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, - cred_t *cr, caller_context_t *ct, int flags) +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { - zfsctl_snapdir_t *sdp = sdvp->v_data; - zfs_snapentry_t search, *sep; - zfsvfs_t *zfsvfs; - avl_index_t where; - char from[MAXNAMELEN], to[MAXNAMELEN]; - char real[MAXNAMELEN]; - int err; - - zfsvfs = 
sdvp->v_vfsp->vfs_data; - ZFS_ENTER(zfsvfs); - - if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - err = dmu_snapshot_realname(zfsvfs->z_os, snm, real, - MAXNAMELEN, NULL); - if (err == 0) { - snm = real; - } else if (err != ENOTSUP) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - ZFS_EXIT(zfsvfs); - - err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from); - if (!err) - err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to); - if (!err) - err = zfs_secpolicy_rename_perms(from, to, cr); - if (err) - return (err); - - /* - * Cannot move snapshots out of the snapdir. - */ - if (sdvp != tdvp) - return (EINVAL); - - if (strcmp(snm, tnm) == 0) - return (0); - - mutex_enter(&sdp->sd_lock); - - search.se_name = (char *)snm; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) { - mutex_exit(&sdp->sd_lock); - return (ENOENT); - } - - err = dmu_objset_rename(from, to, B_FALSE); - if (err == 0) - zfsctl_rename_snap(sdp, sep, tnm); - - mutex_exit(&sdp->sd_lock); + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; - return (err); + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (SET_ERROR(ENAMETOOLONG)); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); } -/* ARGSUSED */ static int -zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, - caller_context_t *ct, int flags) +zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) { - zfsctl_snapdir_t *sdp = dvp->v_data; - zfs_snapentry_t *sep; - zfs_snapentry_t search; - zfsvfs_t *zfsvfs; - char snapname[MAXNAMELEN]; - char real[MAXNAMELEN]; + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; int err; - zfsvfs = dvp->v_vfsp->vfs_data; - ZFS_ENTER(zfsvfs); - - if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { - - err = dmu_snapshot_realname(zfsvfs->z_os, name, real, - MAXNAMELEN, NULL); - if (err == 0) { - name = real; - } else if (err != ENOTSUP) { - ZFS_EXIT(zfsvfs); - return (err); - } - } - - ZFS_EXIT(zfsvfs); - - err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname); - if (!err) - err = zfs_secpolicy_destroy_perms(snapname, cr); - if (err) - return (err); - - mutex_enter(&sdp->sd_lock); - - search.se_name = name; - sep = avl_find(&sdp->sd_snaps, &search, NULL); - if (sep) { - avl_remove(&sdp->sd_snaps, sep); - err = zfsctl_unmount_snap(sep, MS_FORCE, cr); - if (err) - avl_add(&sdp->sd_snaps, sep); - else - err = dmu_objset_destroy(snapname, B_FALSE); - } else { - err = ENOENT; - } - - mutex_exit(&sdp->sd_lock); - + err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); return (err); } /* - * This creates a snapshot under '.zfs/snapshot'. + * Given a vnode get a root vnode of a filesystem mounted on top of + * the vnode, if any. The root vnode is referenced and locked. + * If no filesystem is mounted then the orinal vnode remains referenced + * and locked. If any error happens the orinal vnode is unlocked and + * released. 
*/ -/* ARGSUSED */ static int -zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, - cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp) +zfsctl_mounted_here(vnode_t **vpp, int flags) { - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - char name[MAXNAMELEN]; + struct mount *mp; int err; - static enum symfollow follow = NO_FOLLOW; - static enum uio_seg seg = UIO_SYSSPACE; - if (snapshot_namecheck(dirname, NULL, NULL) != 0) - return (EILSEQ); + ASSERT_VOP_LOCKED(*vpp, __func__); + ASSERT3S((*vpp)->v_type, ==, VDIR); - dmu_objset_name(zfsvfs->z_os, name); + if ((mp = (*vpp)->v_mountedhere) != NULL) { + err = vfs_busy(mp, 0); + KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); + KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); + vput(*vpp); + err = VFS_ROOT(mp, flags, vpp); + vfs_unbusy(mp); + return (err); + } + return (EJUSTRETURN); +} - *vpp = NULL; +typedef struct { + const char *snap_name; + uint64_t snap_id; +} snapshot_setup_arg_t; - err = zfs_secpolicy_snapshot_perms(name, cr); - if (err) - return (err); +static void +zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) +{ + snapshot_setup_arg_t *ssa = arg; + sfs_node_t *node; - if (err == 0) { - err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE); - if (err) - return (err); - err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); - } + ASSERT_VOP_ELOCKED(vp, __func__); - return (err); + node = sfs_alloc_node(sizeof(sfs_node_t), + ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); + zfsctl_common_vnode_setup(vp, node); + + /* We have to support recursive locking. */ + VN_LOCK_AREC(vp); } /* @@ -728,151 +824,112 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char * Perform a mount of the associated dataset on top of the vnode. */ /* ARGSUSED */ -static int -zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - zfsctl_snapdir_t *sdp = dvp->v_data; - objset_t *snap; - char snapname[MAXNAMELEN]; - char real[MAXNAMELEN]; +int +zfsctl_snapdir_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + char name[NAME_MAX + 1]; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; char *mountpoint; - zfs_snapentry_t *sep, search; - struct mounta margs; - vfs_t *vfsp; size_t mountpoint_len; - avl_index_t where; zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + uint64_t snap_id; + int nameiop = cnp->cn_nameiop; + int lkflags = cnp->cn_lkflags; + int flags = cnp->cn_flags; int err; - /* - * No extended attributes allowed under .zfs - */ - if (flags & LOOKUP_XATTR) - return (EINVAL); - ASSERT(dvp->v_type == VDIR); - /* - * If we get a recursive call, that means we got called - * from the domount() code while it was trying to look up the - * spec (which looks like a local path for zfs). We need to - * add some flag to domount() to tell it not to do this lookup. 
- */ - if (MUTEX_HELD(&sdp->sd_lock)) - return (ENOENT); - - ZFS_ENTER(zfsvfs); + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { - ZFS_EXIT(zfsvfs); - return (0); + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + return (err); + } + if (flags & ISDOTDOT) { + err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, + vpp); + return (err); } - if (flags & FIGNORECASE) { - boolean_t conflict = B_FALSE; + if (cnp->cn_namelen >= sizeof(name)) + return (SET_ERROR(ENAMETOOLONG)); - err = dmu_snapshot_realname(zfsvfs->z_os, nm, real, - MAXNAMELEN, &conflict); - if (err == 0) { - nm = real; - } else if (err != ENOTSUP) { - ZFS_EXIT(zfsvfs); + strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + err = zfsctl_snapshot_lookup(dvp, name, &snap_id); + if (err != 0) + return (SET_ERROR(ENOENT)); + + for (;;) { + snapshot_setup_arg_t ssa; + + ssa.snap_name = name; + ssa.snap_id = snap_id; + err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, + snap_id, "zfs", &zfsctl_ops_snapshot, + zfsctl_snapshot_vnode_setup, &ssa, vpp); + if (err != 0) return (err); - } - if (realpnp) - (void) strlcpy(realpnp->pn_buf, nm, - realpnp->pn_bufsize); - if (conflict && direntflags) - *direntflags = ED_CASE_CONFLICT; - } - - mutex_enter(&sdp->sd_lock); - search.se_name = (char *)nm; - if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) { - *vpp = sep->se_root; - VN_HOLD(*vpp); - err = traverse(vpp); - if (err) { - VN_RELE(*vpp); - *vpp = NULL; - } else if (*vpp == sep->se_root) { - /* - * The snapshot was unmounted behind our backs, - * try to remount it. - */ - goto domount; - } else { - /* - * VROOT was set during the traverse call. We need - * to clear it since we're pretending to be part - * of our parent's vfs. - */ - (*vpp)->v_flag &= ~VROOT; - } - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (err); - } - /* - * The requested snapshot is not currently mounted, look it up. - */ - err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname); - if (err) { - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); + /* Check if a new vnode has just been created. */ + if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) + break; + /* - * handle "ls *" or "?" in a graceful manner, - * forcing EILSEQ to ENOENT. - * Since shell ultimately passes "*" or "?" as name to lookup + * The vnode must be referenced at least by this thread and + * the mounted snapshot or the thread doing the mounting. + * There can be more references from concurrent lookups. */ - return (err == EILSEQ ? 
ENOENT : err); - } - if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - return (ENOENT); - } - - sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP); - sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP); - (void) strcpy(sep->se_name, nm); - *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); - avl_insert(&sdp->sd_snaps, sep, where); - - dmu_objset_rele(snap, FTAG); -domount: - mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + - strlen("/.zfs/snapshot/") + strlen(nm) + 1; - mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); - (void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s", - refstr_value(dvp->v_vfsp->vfs_mntpt), nm); + KASSERT(vrefcnt(*vpp) > 1, ("found unreferenced mountpoint")); - margs.spec = snapname; - margs.dir = mountpoint; - margs.flags = MS_SYSSPACE | MS_NOMNTTAB; - margs.fstype = "zfs"; - margs.dataptr = NULL; - margs.datalen = 0; - margs.optptr = NULL; - margs.optlen = 0; + /* + * Check if a snapshot is already mounted on top of the vnode. + */ + err = zfsctl_mounted_here(vpp, lkflags); + if (err != EJUSTRETURN) + return (err); - err = domount("zfs", &margs, *vpp, kcred, &vfsp); - kmem_free(mountpoint, mountpoint_len); +#ifdef INVARIANTS + /* + * If the vnode not covered yet, then the mount operation + * must be in progress. + */ + VI_LOCK(*vpp); + KASSERT(((*vpp)->v_iflag & VI_MOUNT) != 0, + ("snapshot vnode not covered")); + VI_UNLOCK(*vpp); +#endif + vput(*vpp); - if (err == 0) { /* - * Return the mounted root rather than the covered mount point. - * Takes the GFS vnode at .zfs/snapshot/ and returns - * the ZFS vnode mounted on top of the GFS node. This ZFS - * vnode is the root of the newly created vfsp. + * In this situation we can loop on uncontested locks and starve + * the thread doing the lengthy, non-trivial mount operation. */ - VFS_RELE(vfsp); - err = traverse(vpp); + kern_yield(PRI_USER); } + VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname)); + + mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, + "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", + dvp->v_vfsp->mnt_stat.f_mntonname, name); + + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); + kmem_free(mountpoint, mountpoint_len); if (err == 0) { /* * Fix up the root vnode mounted on .zfs/snapshot/. @@ -883,385 +940,246 @@ domount: */ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; - (*vpp)->v_vfsp = zfsvfs->z_vfs; - (*vpp)->v_flag &= ~VROOT; + + /* Clear the root flag (set via VFS_ROOT) as well. */ + (*vpp)->v_vflag &= ~VV_ROOT; } - mutex_exit(&sdp->sd_lock); - ZFS_EXIT(zfsvfs); - /* - * If we had an error, drop our hold on the vnode and - * zfsctl_snapshot_inactive() will clean up. 
- */ - if (err) { - VN_RELE(*vpp); + if (err != 0) *vpp = NULL; - } return (err); } -/* ARGSUSED */ static int -zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, - int *direntflags, pathname_t *realpnp) -{ - zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; - znode_t *dzp; - int error; - - ZFS_ENTER(zfsvfs); - - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - if (zfsvfs->z_shares_dir == 0) { - ZFS_EXIT(zfsvfs); - return (ENOTSUP); - } - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) - error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp, - flags, rdir, cr, ct, direntflags, realpnp); - - VN_RELE(ZTOV(dzp)); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/* ARGSUSED */ -static int -zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, - offset_t *offp, offset_t *nextp, void *data, int flags) -{ +zfsctl_snapdir_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *ncookies; + u_long **a_cookies; + } */ *ap; +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + struct dirent entry; + vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - char snapname[MAXNAMELEN]; - uint64_t id, cookie; - boolean_t case_conflict; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; int error; - ZFS_ENTER(zfsvfs); + ASSERT(vp->v_type == VDIR); - cookie = *offp; - error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, - &cookie, &case_conflict); - if (error) { - ZFS_EXIT(zfsvfs); - if (error == ENOENT) { - *eofp = 1; - return (0); - } + error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; return (error); } - if (flags & V_RDDIR_ENTFLAGS) { - edirent_t *eodp = dp; - - (void) strcpy(eodp->ed_name, snapname); - eodp->ed_ino = ZFSCTL_INO_SNAP(id); - eodp->ed_eflags = case_conflict ? 
ED_CASE_CONFLICT : 0; - } else { - struct dirent64 *odp = dp; + for (;;) { + uint64_t cookie; + uint64_t id; + + cookie = uio->uio_offset - dots_offset; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) { + if (eofp != NULL) + *eofp = 1; + error = 0; + } + return (error); + } - (void) strcpy(odp->d_name, snapname); - odp->d_ino = ZFSCTL_INO_SNAP(id); + entry.d_fileno = id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, snapname); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof(entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + return (SET_ERROR(error)); + } + uio->uio_offset = cookie + dots_offset; } - *nextp = cookie; - - ZFS_EXIT(zfsvfs); - - return (0); + /* NOTREACHED */ } /* ARGSUSED */ static int -zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, - caller_context_t *ct, int flags) +zfsctl_snapdir_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; { + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - znode_t *dzp; - int error; - - ZFS_ENTER(zfsvfs); + dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); + sfs_node_t *node = vp->v_data; + uint64_t snap_count; + int err; - if (zfsvfs->z_shares_dir == 0) { - ZFS_EXIT(zfsvfs); - return (ENOTSUP); - } - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { - error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags); - VN_RELE(ZTOV(dzp)); - } else { - *eofp = 1; - error = ENOENT; + zfsctl_common_getattr(vp, vap); + vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (err != 0) + return (err); + vap->va_nlink += snap_count; } + vap->va_size = vap->va_nlink; - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * pvp is the '.zfs' directory (zfsctl_node_t). - * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). - * - * This function is the callback to create a GFS vnode for '.zfs/snapshot' - * when a lookup is performed on .zfs for "snapshot". 
- */ -vnode_t * -zfsctl_mknode_snapdir(vnode_t *pvp) -{ - vnode_t *vp; - zfsctl_snapdir_t *sdp; - - vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, - zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN, - zfsctl_snapdir_readdir_cb, NULL); - sdp = vp->v_data; - sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR; - sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; - mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&sdp->sd_snaps, snapentry_compare, - sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node)); - return (vp); + return (0); } -vnode_t * -zfsctl_mknode_shares(vnode_t *pvp) -{ - vnode_t *vp; - zfsctl_node_t *sdp; - - vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, - zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, - NULL, NULL); - sdp = vp->v_data; - sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; - return (vp); - -} +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_snapdir_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, +}; -/* ARGSUSED */ static int -zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) +zfsctl_snapshot_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; { - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - znode_t *dzp; - int error; - - ZFS_ENTER(zfsvfs); - if (zfsvfs->z_shares_dir == 0) { - ZFS_EXIT(zfsvfs); - return (ENOTSUP); - } - if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { - error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct); - VN_RELE(ZTOV(dzp)); - } - ZFS_EXIT(zfsvfs); - return (error); - + vnode_t *vp = ap->a_vp; + VERIFY(vrecycle(vp) == 1); + return (0); } -/* ARGSUSED */ static int -zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) +zfsctl_snapshot_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; { - zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; - zfsctl_snapdir_t *sdp = vp->v_data; - - ZFS_ENTER(zfsvfs); - zfsctl_common_getattr(vp, vap); - vap->va_nodeid = gfs_file_inode(vp); - vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; - vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); - ZFS_EXIT(zfsvfs); + vnode_t *vp = ap->a_vp; + void *data = vp->v_data; + sfs_reclaim_vnode(vp); + sfs_destroy_node(data); return (0); } -/* ARGSUSED */ -static void -zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - zfsctl_snapdir_t *sdp = vp->v_data; - void *private; - - private = gfs_dir_inactive(vp); - if (private != NULL) { - ASSERT(avl_numnodes(&sdp->sd_snaps) == 0); - mutex_destroy(&sdp->sd_lock); - avl_destroy(&sdp->sd_snaps); - kmem_free(private, sizeof (zfsctl_snapdir_t)); - } -} - -#ifndef __NetBSD__ -static const fs_operation_def_t zfsctl_tops_snapdir[] = { - { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, - { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, - { VOPNAME_IOCTL, { .error = fs_inval } }, - { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } }, - { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, - { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } }, - { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } }, - { 
VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } }, - { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } }, - { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } }, - { VOPNAME_SEEK, { .vop_seek = fs_seek } }, - { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } }, - { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, - { NULL } -}; - -static const fs_operation_def_t zfsctl_tops_shares[] = { - { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, - { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, - { VOPNAME_IOCTL, { .error = fs_inval } }, - { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, - { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, - { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, - { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } }, - { VOPNAME_SEEK, { .vop_seek = fs_seek } }, - { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, - { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, - { NULL } -}; -#endif -/* - * pvp is the GFS vnode '.zfs/snapshot'. - * - * This creates a GFS node under '.zfs/snapshot' representing each - * snapshot. This newly created GFS node is what we mount snapshot - * vfs_t's ontop of. - */ -static vnode_t * -zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) -{ - vnode_t *vp; - zfsctl_node_t *zcp; - - vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, - zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); - zcp = vp->v_data; - zcp->zc_id = objset; - - return (vp); -} - -static void -zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +static int +zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) { - zfsctl_snapdir_t *sdp; - zfs_snapentry_t *sep, *next; + struct mount *mp; vnode_t *dvp; + vnode_t *vp; + sfs_node_t *node; + size_t len; + int locked; + int error; - VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0); - sdp = dvp->v_data; - - mutex_enter(&sdp->sd_lock); - - if (vp->v_count > 1) { - mutex_exit(&sdp->sd_lock); - return; - } - ASSERT(!vn_ismntpt(vp)); - - sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - next = AVL_NEXT(&sdp->sd_snaps, sep); + vp = ap->a_vp; + node = vp->v_data; + len = strlen(node->sn_name); + if (*ap->a_buflen < len) + return (SET_ERROR(ENOMEM)); - if (sep->se_root == vp) { - avl_remove(&sdp->sd_snaps, sep); - kmem_free(sep->se_name, strlen(sep->se_name) + 1); - kmem_free(sep, sizeof (zfs_snapentry_t)); - break; - } - sep = next; - } - ASSERT(sep != NULL); - - mutex_exit(&sdp->sd_lock); - VN_RELE(dvp); + /* + * Prevent unmounting of the snapshot while the vnode lock + * is not held. That is not strictly required, but allows + * us to assert that an uncovered snapshot vnode is never + * "leaked". + */ + mp = vp->v_mountedhere; + if (mp == NULL) + return (SET_ERROR(ENOENT)); + error = vfs_busy(mp, 0); + KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); /* - * Dispose of the vnode for the snapshot mount point. - * This is safe to do because once this entry has been removed - * from the AVL tree, it can't be found again, so cannot become - * "active". If we lookup the same name again we will end up - * creating a new vnode. + * We can vput the vnode as we can now depend on the reference owned + * by the busied mp. But we also need to hold the vnode, because + * the reference may go after vfs_unbusy() which has to be called + * before we can lock the vnode again. 
*/ - gfs_vop_inactive(vp, cr, ct); + locked = VOP_ISLOCKED(vp); + vhold(vp); + vput(vp); + + /* Look up .zfs/snapshot, our parent. */ + error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); + if (error == 0) { + VOP_UNLOCK(dvp, 0); + *ap->a_vpp = dvp; + *ap->a_buflen -= len; + bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); + } + vfs_unbusy(mp); + vget(vp, locked | LK_VNHELD | LK_RETRY, curthread); + return (error); } -#ifndef __NetBSD__ /* * These VP's should never see the light of day. They should always * be covered. */ -static const fs_operation_def_t zfsctl_tops_snapshot[] = { - VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapshot_inactive }, - NULL, NULL +static struct vop_vector zfsctl_ops_snapshot = { + .vop_default = NULL, /* ensure very restricted access */ + .vop_inactive = zfsctl_snapshot_inactive, + .vop_reclaim = zfsctl_snapshot_reclaim, + .vop_vptocnp = zfsctl_snapshot_vptocnp, + .vop_lock1 = vop_stdlock, + .vop_unlock = vop_stdunlock, + .vop_islocked = vop_stdislocked, + .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ + .vop_print = zfsctl_common_print, }; -#endif int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) { + struct mount *mp; zfsvfs_t *zfsvfs = vfsp->vfs_data; - vnode_t *dvp, *vp; - zfsctl_snapdir_t *sdp; - zfsctl_node_t *zcp; - zfs_snapentry_t *sep; + vnode_t *vp; int error; ASSERT(zfsvfs->z_ctldir != NULL); - error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, kcred, NULL, NULL, NULL); - if (error != 0) - return (error); - sdp = dvp->v_data; - - mutex_enter(&sdp->sd_lock); - sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - vp = sep->se_root; - zcp = vp->v_data; - if (zcp->zc_id == objsetid) - break; - - sep = AVL_NEXT(&sdp->sd_snaps, sep); - } - - if (sep != NULL) { - VN_HOLD(vp); + *zfsvfsp = NULL; + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, objsetid, &vp); + if (error == 0 && vp != NULL) { /* - * Return the mounted root rather than the covered mount point. - * Takes the GFS vnode at .zfs/snapshot/ - * and returns the ZFS vnode mounted on top of the GFS node. - * This ZFS vnode is the root of the vfs for objset 'objsetid'. + * XXX Probably need to at least reference, if not busy, the mp. 
*/ - error = traverse(&vp); - if (error == 0) { - if (vp == sep->se_root) - error = EINVAL; - else - *zfsvfsp = VTOZ(vp)->z_zfsvfs; - } - mutex_exit(&sdp->sd_lock); - VN_RELE(vp); - } else { - error = EINVAL; - mutex_exit(&sdp->sd_lock); + if (vp->v_mountedhere != NULL) + *zfsvfsp = vp->v_mountedhere->mnt_data; + vput(vp); } - - VN_RELE(dvp); - - return (error); + if (*zfsvfsp == NULL) + return (SET_ERROR(EINVAL)); + return (0); } /* @@ -1272,43 +1190,84 @@ zfsctl_lookup_objset(vfs_t *vfsp, uint64 int zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) { + char snapname[ZFS_MAX_DATASET_NAME_LEN]; zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct mount *mp; vnode_t *dvp; - zfsctl_snapdir_t *sdp; - zfs_snapentry_t *sep, *next; + vnode_t *vp; + sfs_node_t *node; + sfs_node_t *snap; + uint64_t cookie; int error; ASSERT(zfsvfs->z_ctldir != NULL); - error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp, - NULL, 0, NULL, cr, NULL, NULL, NULL); - if (error != 0) - return (error); - sdp = dvp->v_data; - mutex_enter(&sdp->sd_lock); + cookie = 0; + for (;;) { + uint64_t id; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) + error = 0; + break; + } - sep = avl_first(&sdp->sd_snaps); - while (sep != NULL) { - next = AVL_NEXT(&sdp->sd_snaps, sep); + for (;;) { + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, id, &vp); + if (error != 0 || vp == NULL) + break; - /* - * If this snapshot is not mounted, then it must - * have just been unmounted by somebody else, and - * will be cleaned up by zfsctl_snapdir_inactive(). - */ - if (vn_ismntpt(sep->se_root)) { - avl_remove(&sdp->sd_snaps, sep); - error = zfsctl_unmount_snap(sep, fflags, cr); - if (error) { - avl_add(&sdp->sd_snaps, sep); + mp = vp->v_mountedhere; + + /* + * v_mountedhere being NULL means that the + * (uncovered) vnode is in a transient state + * (mounting or unmounting), so loop until it + * settles down. + */ + if (mp != NULL) break; - } + vput(vp); } - sep = next; + if (error != 0) + break; + if (vp == NULL) + continue; /* no mountpoint, nothing to do */ + + /* + * The mount-point vnode is kept locked to avoid spurious EBUSY + * from a concurrent umount. + * The vnode lock must have recursive locking enabled. + */ + vfs_ref(mp); + error = dounmount(mp, fflags, curthread); + KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, + ("extra references after unmount")); + vput(vp); + if (error != 0) + break; } + KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, + ("force unmounting failed")); + return (error); +} - mutex_exit(&sdp->sd_lock); - VN_RELE(dvp); +#endif /* __FreeBSD__ */ - return (error); +#ifdef __NetBSD__ + +#include +#include + +boolean_t +zfsctl_is_node(vnode_t *vp) +{ + + return B_FALSE; } +#endif /* __NetBSD__ */ Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_debug.c 31 Jul 2014 04:37:12 -0000 @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +zfs_dbgmsg_fini(void) +{ + zfs_dbgmsg_t *zdm; + + while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { + int size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_destroy(&zfs_dbgmsgs_lock); + ASSERT0(zfs_dbgmsg_size); +} + +/* + * Print these messages by running: + * echo ::zfs_dbgmsg | mdb -k + * + * Monitor these messages by running: + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' + */ +void +zfs_dbgmsg(const char *fmt, ...) +{ + int size; + va_list adx; + zfs_dbgmsg_t *zdm; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx); + va_end(adx); + + /* + * There is one byte of string in sizeof (zfs_dbgmsg_t), used + * for the terminating null. + */ + zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); + zdm->zdm_timestamp = gethrestime_sec(); + + va_start(adx, fmt); + (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); + va_end(adx); + + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += sizeof (zfs_dbgmsg_t) + size; + while (zfs_dbgmsg_size > zfs_dbgmsg_maxsize) { + zdm = list_remove_head(&zfs_dbgmsgs); + size = sizeof (zfs_dbgmsg_t) + strlen(zdm->zdm_msg); + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } + mutex_exit(&zfs_dbgmsgs_lock); +} + +void +zfs_dbgmsg_print(const char *tag) +{ + zfs_dbgmsg_t *zdm; + + (void) printf("ZFS_DBGMSG(%s):\n", tag); + mutex_enter(&zfs_dbgmsgs_lock); + for (zdm = list_head(&zfs_dbgmsgs); zdm; + zdm = list_next(&zfs_dbgmsgs, zdm)) + (void) printf("%s\n", zdm->zdm_msg); + mutex_exit(&zfs_dbgmsgs_lock); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c,v retrieving revision 1.10 diff -u -p -r1.10 zfs_dir.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c 9 Apr 2015 05:02:28 -0000 1.10 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c 28 Apr 2017 21:13:09 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. */ #include @@ -32,10 +32,8 @@ #include #include #include -#include #include #include -#include #include #include #include @@ -43,6 +41,11 @@ #include #include #include +#ifdef __FreeBSD__ +#include +#include +#include +#endif #include #include #include @@ -51,135 +54,70 @@ #include #include #include +#include +#include #include #include /* - * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups + * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups * of names after deciding which is the appropriate lookup interface. */ static int -zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, - boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + boolean_t exact, uint64_t *zoid) { int error; if (zfsvfs->z_norm) { - matchtype_t mt = MT_FIRST; - boolean_t conflict = B_FALSE; - size_t bufsz = 0; - char *buf = NULL; - - if (rpnp) { - buf = rpnp->pn_buf; - bufsz = rpnp->pn_bufsize; - } - if (exact) - mt = MT_EXACT; + matchtype_t mt = exact? MT_EXACT : MT_FIRST; + /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, - zoid, mt, buf, bufsz, &conflict); - if (!error && deflags) - *deflags = conflict ? ED_CASE_CONFLICT : 0; + zoid, mt, NULL, 0, NULL); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } *zoid = ZFS_DIRENT_OBJ(*zoid); - if (error == ENOENT && update) - dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE); - return (error); } /* - * Reference counting for dirlocks. Solaris destroys the condvar as - * soon as it broadcasts, which works for them because cv_wait doesn't - * need to use the condvar after it is woken, but which is too fast and - * loose with the abstraction for us in NetBSD. - */ - -static int -zfs_dirlock_hold(zfs_dirlock_t *dl) -{ - - KASSERT(mutex_owned(&dl->dl_dzp->z_lock)); - - if (dl->dl_refcnt >= ULONG_MAX) /* XXX Name this constant. */ - return (ENFILE); /* XXX What to do? */ - - dl->dl_refcnt++; - return (0); -} - -static void -zfs_dirlock_rele(zfs_dirlock_t *dl) -{ - - KASSERT(mutex_owned(&dl->dl_dzp->z_lock)); - KASSERT(dl->dl_refcnt > 0); - - if (--dl->dl_refcnt == 0) { - if (dl->dl_namesize != 0) - kmem_free(dl->dl_name, dl->dl_namesize); - cv_destroy(&dl->dl_cv); - kmem_free(dl, sizeof(*dl)); - } -} - -/* - * Lock a directory entry. A dirlock on protects that name - * in dzp's directory zap object. As long as you hold a dirlock, you can - * assume two things: (1) dzp cannot be reaped, and (2) no other thread - * can change the zap entry for (i.e. link or unlink) this name. + * Look up a directory entry under a locked vnode. + * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. - * ZSHARED: allow concurrent access with other ZSHARED callers. * ZXATTR: we want dzp's xattr directory - * ZCILOOK: On a mixed sensitivity file system, - * this lookup should be case-insensitive. 
- * ZCIEXACT: On a purely case-insensitive file system, - * this lookup should be case-sensitive. - * ZRENAMING: we are locking for renaming, force narrow locks - * ZHAVELOCK: Don't grab the z_name_lock for this call. The - * current thread already holds it. * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) - * dlpp - pointer to the dirlock for this entry (NULL on error) - * direntflags - (case-insensitive lookup only) - * flags if multiple case-sensitive matches exist in directory - * realpnp - (case-insensitive lookup only) - * actual name matched within the directory * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. - * NOTE: For case-insensitive file systems we take wide locks (see below), - * but return znode pointers to a single match. */ int -zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, - int flag, int *direntflags, pathname_t *realpnp) +zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t *dl; - boolean_t update; boolean_t exact; uint64_t zoid; vnode_t *vp = NULL; int error = 0; - int cmpflags; + + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); *zpp = NULL; - *dlpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' @@ -187,287 +125,99 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, zn if (name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) || zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) - return (EEXIST); + return (SET_ERROR(EEXIST)); /* * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices - * affect what vnodes can be cached in the DNLC, how we - * perform zap lookups, and the "width" of our dirlocks. + * affect how we perform zap lookups. * - * A normal dirlock locks a single name. Note that with - * normalization a name can be composed multiple ways, but - * when normalized, these names all compare equal. A wide - * dirlock locks multiple names. We need these when the file - * system is supporting mixed-mode access. It is sometimes - * necessary to lock all case permutations of file name at - * once so that simultaneous case-insensitive/case-sensitive - * behaves as rationally as possible. - */ - - /* * Decide if exact matches should be requested when performing * a zap lookup on file systems supporting case-insensitive * access. - */ - exact = - ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK)); - - /* - * Only look in or update the DNLC if we are looking for the - * name on a file system that does not require normalization - * or case folding. We can also look there if we happen to be - * on a non-normalizing, mixed sensitivity file system IF we - * are looking for the exact name. * - * Maybe can add TO-UPPERed version of name to dnlc in ci-only - * case for performance improvement? + * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE + * because in that case MT_EXACT and MT_FIRST should produce exactly + * the same result. 
*/ - update = !zfsvfs->z_norm || - ((zfsvfs->z_case == ZFS_CASE_MIXED) && - !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + exact = zfsvfs->z_case == ZFS_CASE_MIXED; - /* - * ZRENAMING indicates we are in a situation where we should - * take narrow locks regardless of the file system's - * preferences for normalizing and case folding. This will - * prevent us deadlocking trying to grab the same wide lock - * twice if the two names happen to be case-insensitive - * matches. - */ - if (flag & ZRENAMING) - cmpflags = 0; - else - cmpflags = zfsvfs->z_norm; - - /* - * Wait until there are no locks on this name. - * - * Don't grab the the lock if it is already held. However, cannot - * have both ZSHARED and ZHAVELOCK together. - */ - ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); - if (!(flag & ZHAVELOCK)) - rw_enter(&dzp->z_name_lock, RW_READER); - - mutex_enter(&dzp->z_lock); - for (;;) { - if (dzp->z_unlinked) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (ENOENT); - } - for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { - if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, - U8_UNICODE_LATEST, &error) == 0) || error != 0) - break; - } - if (error != 0) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (ENOENT); - } - if (dl == NULL) { - /* - * Allocate a new dirlock and add it to the list. - */ - dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); - cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); - dl->dl_name = name; - dl->dl_sharecnt = 0; - dl->dl_namelock = 0; - dl->dl_namesize = 0; - dl->dl_refcnt = 1; - dl->dl_dzp = dzp; - dl->dl_next = dzp->z_dirlocks; - dzp->z_dirlocks = dl; - break; - } - if ((flag & ZSHARED) && dl->dl_sharecnt != 0) - break; - error = zfs_dirlock_hold(dl); - if (error) { - mutex_exit(&dzp->z_lock); - if (!(flag & ZHAVELOCK)) - rw_exit(&dzp->z_name_lock); - return (error); - } - cv_wait(&dl->dl_cv, &dzp->z_lock); - zfs_dirlock_rele(dl); - } - - /* - * If the z_name_lock was NOT held for this dirlock record it. - */ - if (flag & ZHAVELOCK) - dl->dl_namelock = 1; - - if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { - /* - * We're the second shared reference to dl. Make a copy of - * dl_name in case the first thread goes away before we do. - * Note that we initialize the new name before storing its - * pointer into dl_name, because the first thread may load - * dl->dl_name at any time. He'll either see the old value, - * which is his, or the new shared copy; either is OK. - */ - dl->dl_namesize = strlen(dl->dl_name) + 1; - name = kmem_alloc(dl->dl_namesize, KM_SLEEP); - bcopy(dl->dl_name, name, dl->dl_namesize); - dl->dl_name = name; - } - - mutex_exit(&dzp->z_lock); - - /* - * We have a dirlock on the name. (Note that it is the dirlock, - * not the dzp's z_lock, that protects the name in the zap object.) - * See if there's an object by this name; if so, put a hold on it. - */ + if (dzp->z_unlinked && !(flag & ZXATTR)) + return (ENOENT); if (flag & ZXATTR) { - zoid = dzp->z_phys->zp_xattr; - error = (zoid == 0 ? ENOENT : 0); + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? 
ENOENT : 0); } else { - if (update) - vp = dnlc_lookup(ZTOV(dzp), name); - if (vp == DNLC_NO_VNODE) { - VN_RELE(vp); - error = ENOENT; - } else if (vp) { - if (flag & ZNEW) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - return (EEXIST); - } - *dlpp = dl; - *zpp = VTOZ(vp); - return (0); - } else { - error = zfs_match_find(zfsvfs, dzp, name, exact, - update, direntflags, realpnp, &zoid); - } + error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { - zfs_dirent_unlock(dl); return (error); } } else { if (flag & ZNEW) { - zfs_dirent_unlock(dl); - return (EEXIST); + return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, zpp); - if (error) { - zfs_dirent_unlock(dl); + if (error) return (error); - } - if (!(flag & ZXATTR) && update) - dnlc_update(ZTOV(dzp), name, ZTOV(*zpp)); + ASSERT(!(*zpp)->z_unlinked); } - *dlpp = dl; - return (0); } -/* - * Unlock this directory entry and wake anyone who was waiting for it. - */ -void -zfs_dirent_unlock(zfs_dirlock_t *dl) +static int +zfs_dd_lookup(znode_t *dzp, znode_t **zpp) { - znode_t *dzp = dl->dl_dzp; - zfs_dirlock_t **prev_dl, *cur_dl; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + znode_t *zp; + uint64_t parent; + int error; - mutex_enter(&dzp->z_lock); + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); - if (!dl->dl_namelock) - rw_exit(&dzp->z_name_lock); - - if (dl->dl_sharecnt > 1) { - dl->dl_sharecnt--; - mutex_exit(&dzp->z_lock); - return; - } - prev_dl = &dzp->z_dirlocks; - while ((cur_dl = *prev_dl) != dl) - prev_dl = &cur_dl->dl_next; - *prev_dl = dl->dl_next; - cv_broadcast(&dl->dl_cv); - zfs_dirlock_rele(dl); - mutex_exit(&dzp->z_lock); + if (dzp->z_unlinked) + return (ENOENT); + + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *zpp = zp; + return (error); } -/* - * Look up an entry in a directory. - * - * NOTE: '.' and '..' are handled as special cases because - * no directory entries are actually stored for them. If this is - * the root of a filesystem, then '.zfs' is also treated as a - * special pseudo-directory. - */ int -zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags, - int *deflg, pathname_t *rpnp) +zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) { - zfs_dirlock_t *dl; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; int error = 0; + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); + + if (dzp->z_unlinked) + return (SET_ERROR(ENOENT)); + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { - *vpp = ZTOV(dzp); - VN_HOLD(*vpp); + *zpp = dzp; } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { - zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - /* - * If we are a snapshot mounted under .zfs, return - * the vp for the snapshot directory. - */ - if (dzp->z_phys->zp_parent == dzp->z_id && - zfsvfs->z_parent != zfsvfs) { - error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, - "snapshot", vpp, NULL, 0, NULL, kcred, - NULL, NULL, NULL); - return (error); - } - rw_enter(&dzp->z_parent_lock, RW_READER); - mutex_enter(&dzp->z_lock); - if (dzp->z_phys->zp_links == 0) { - /* Directory has been rmdir'd. 
*/ - error = ENOENT; - } else { - error = zfs_zget(zfsvfs, dzp->z_phys->zp_parent, &zp); - if (error == 0) - *vpp = ZTOV(zp); - } - mutex_exit(&dzp->z_lock); - rw_exit(&dzp->z_parent_lock); - } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { - *vpp = zfsctl_root(dzp); + error = zfs_dd_lookup(dzp, zpp); } else { - int zf; - - zf = ZEXISTS | ZSHARED; - if (flags & FIGNORECASE) - zf |= ZCILOOK; - - error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); if (error == 0) { - *vpp = ZTOV(zp); - zfs_dirent_unlock(dl); dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + *zpp = zp; } - rpnp = NULL; } - - if ((flags & FIGNORECASE) && rpnp && !error) - (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); - return (error); } @@ -491,7 +241,7 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t * zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); - ASSERT3U(zp->z_phys->zp_links, ==, 0); + ASSERT(zp->z_links == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); @@ -543,8 +293,9 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs) if (error != 0) continue; + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); zp->z_unlinked = B_TRUE; - VN_RELE(ZTOV(zp)); + vput(ZTOV(zp)); } zap_cursor_fini(&zc); } @@ -568,7 +319,6 @@ zfs_purgedir(znode_t *dzp) znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; - zfs_dirlock_t dl; int skipped = 0; int error; @@ -582,31 +332,32 @@ zfs_purgedir(znode_t *dzp) continue; } + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, dzp->z_id); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); - dmu_tx_hold_bonus(tx, xzp->z_id); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); skipped += 1; continue; } - bzero(&dl, sizeof (dl)); - dl.dl_dzp = dzp; - dl.dl_name = zap.za_name; - error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); } zap_cursor_fini(&zc); if (error != ENOENT) @@ -622,15 +373,19 @@ zfs_rmnode(znode_t *zp) znode_t *xzp = NULL; dmu_tx_t *tx; uint64_t acl_obj; + uint64_t xattr_obj; int error; - ASSERT(ZTOV(zp)->v_count == 0); - ASSERT(zp->z_phys->zp_links == 0); + ASSERT(zp->z_links == 0); +#ifndef __NetBSD__ + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); +#endif /* * If this is an attribute directory, purge its contents. */ - if (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_XATTR)) { + if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && + (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. @@ -640,31 +395,40 @@ zfs_rmnode(znode_t *zp) zfs_znode_free(zp); return; } - } - - /* - * Free up all the data in the file. - */ - error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); - if (error) { + } else { /* - * Not enough space. Leave the file in the unlinked set. + * Free up all the data in the file. We don't do this for + * XATTR directories because we need truncate and remove to be + * in the same tx, like in zfs_znode_delete(). 
Otherwise, if + * we crash here we'll end up with an inconsistent truncated + * zap object in the delete queue. Note a truncated file is + * harmless since it only contains user data. */ - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space. Leave the file in the unlinked + * set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT(error == 0); + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT3S(error, ==, 0); + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); } - acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; + acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. @@ -673,11 +437,13 @@ zfs_rmnode(znode_t *zp) dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xzp) { - dmu_tx_hold_bonus(tx, xzp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* @@ -692,11 +458,11 @@ zfs_rmnode(znode_t *zp) } if (xzp) { - dmu_buf_will_dirty(xzp->z_dbuf, tx); - mutex_enter(&xzp->z_lock); + ASSERT(error == 0); xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ - xzp->z_phys->zp_links = 0; /* no more links to it */ - mutex_exit(&xzp->z_lock); + xzp->z_links = 0; /* no more links to it */ + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &xzp->z_links, sizeof (xzp->z_links), tx)); zfs_unlinked_add(xzp, tx); } @@ -709,140 +475,211 @@ zfs_rmnode(znode_t *zp) dmu_tx_commit(tx); out: if (xzp) - VN_RELE(ZTOV(xzp)); + vput(ZTOV(xzp)); } static uint64_t -zfs_dirent(znode_t *zp) +zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) - de |= IFTODT((zp)->z_phys->zp_mode) << 60; + de |= IFTODT(mode) << 60; return (de); } /* - * Link zp into dl. Can only fail if zp has been unlinked. + * Link zp into dzp. Can only fail if zp has been unlinked. 
*/ int -zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) { - znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; int zp_is_dir = (vp->v_type == VDIR); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; int error; - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); - + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); +#if 0 + if (zp_is_dir) { + error = 0; + if (dzp->z_links >= LINK_MAX) + error = SET_ERROR(EMLINK); + return (error); + } +#endif if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); - mutex_exit(&zp->z_lock); - return (ENOENT); + return (SET_ERROR(ENOENT)); } - zp->z_phys->zp_links++; - } - zp->z_phys->zp_parent = dzp->z_id; /* dzp is now zp's parent */ +#if 0 + if (zp->z_links >= LINK_MAX) { + return (SET_ERROR(EMLINK)); + } +#endif + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); - if (!(flag & ZNEW)) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - mutex_exit(&zp->z_lock); - - dmu_buf_will_dirty(dzp->z_dbuf, tx); - mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size++; /* one dirent added */ - dzp->z_phys->zp_links += zp_is_dir; /* ".." link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); - mutex_exit(&dzp->z_lock); + } else { + ASSERT(zp->z_unlinked == 0); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime, B_TRUE); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + dzp->z_size++; + dzp->z_links += zp_is_dir; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); - value = zfs_dirent(zp); - error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name, + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, 8, 1, &value, tx); - ASSERT(error == 0); - - dnlc_update(ZTOV(dzp), dl->dl_name, vp); + VERIFY0(error); return (0); } +static int +zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (zp->z_zfsvfs->z_norm) { + if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, name, MT_EXACT, tx); + else + error = zap_remove_norm(zp->z_zfsvfs->z_os, + dzp->z_id, name, MT_FIRST, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, + dzp->z_id, name, tx); + } + + return (error); +} + /* - * Unlink zp from dl, and mark zp for deletion if this was the last 
link. + * Unlink zp from dzp, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int -zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, - boolean_t *unlinkedp) +zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag, boolean_t *unlinkedp) { - znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; int error; - dnlc_remove(ZTOV(dzp), dl->dl_name); + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); if (!(flag & ZRENAMING)) { - dmu_buf_will_dirty(zp->z_dbuf, tx); - if (vn_vfswlock(vp)) /* prevent new mounts on zp */ - return (EBUSY); - - if (vn_ismntpt(vp)) { /* don't remove mount point */ - vn_vfsunlock(vp); - return (EBUSY); + if (zp_is_dir && !zfs_dirempty(zp)) { +#ifdef illumos + return (SET_ERROR(EEXIST)); +#else + return (SET_ERROR(ENOTEMPTY)); +#endif } - mutex_enter(&zp->z_lock); - if (zp_is_dir && !zfs_dirempty(zp)) { /* dir not empty */ - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); - return (ENOTEMPTY); + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. + */ + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) { + return (error); } - if (zp->z_phys->zp_links <= zp_is_dir) { + + if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on vnode %p is %u, " - "should be at least %u", - zp->z_vnode, (int)zp->z_phys->zp_links, - zp_is_dir + 1); - zp->z_phys->zp_links = zp_is_dir + 1; + "should be at least %u", zp->z_vnode, + (int)zp->z_links, + zp_is_dir + 1); + zp->z_links = zp_is_dir + 1; } - if (--zp->z_phys->zp_links == zp_is_dir) { + if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; - zp->z_phys->zp_links = 0; + zp->z_links = 0; unlinked = B_TRUE; } else { - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); - } - mutex_exit(&zp->z_lock); - vn_vfsunlock(vp); - } - - dmu_buf_will_dirty(dzp->z_dbuf, tx); - mutex_enter(&dzp->z_lock); - dzp->z_phys->zp_size--; /* one dirent removed */ - dzp->z_phys->zp_links -= zp_is_dir; /* ".." 
link from zp */ - zfs_time_stamper_locked(dzp, CONTENT_MODIFIED, tx); - mutex_exit(&dzp->z_lock); - - if (zp->z_zfsvfs->z_norm) { - if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && - (flag & ZCIEXACT)) || - ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) && - !(flag & ZCILOOK))) - error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_EXACT, tx); - else - error = zap_remove_norm(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, MT_FIRST, tx); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT0(error); } else { - error = zap_remove(zp->z_zfsvfs->z_os, - dzp->z_id, dl->dl_name, tx); + ASSERT(zp->z_unlinked == 0); + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) + return (error); } - ASSERT(error == 0); + + dzp->z_size--; /* one dirent removed */ + dzp->z_links -= zp_is_dir; /* ".." link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; @@ -853,14 +690,12 @@ zfs_link_destroy(zfs_dirlock_t *dl, znod } /* - * Indicate whether the directory is empty. Works with or without z_lock - * held, but can only be consider a hint in the latter case. Returns true - * if only "." and ".." remain and there's no work in progress. + * Indicate whether the directory is empty. 
*/ boolean_t zfs_dirempty(znode_t *dzp) { - return (dzp->z_phys->zp_size == 2 && dzp->z_dirlocks == 0); + return (dzp->z_size == 2); } int @@ -872,42 +707,56 @@ zfs_make_xattrdir(znode_t *zp, vattr_t * int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; + uint64_t parent; *xvpp = NULL; + /* + * In FreeBSD, access checking for creating an EA is being done + * in zfs_setextattr(), + */ +#ifndef __FreeBSD_kernel__ if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); +#endif if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, &acl_ids)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); - return (EDQUOT); + return (SET_ERROR(EDQUOT)); } + getnewvnode_reserve(1); + tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); - if (error == ERESTART) - dmu_tx_wait(tx); dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - ASSERT(xzp->z_phys->zp_parent == zp->z_id); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zp->z_phys->zp_xattr = xzp->z_id; +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); @@ -915,6 +764,8 @@ zfs_make_xattrdir(znode_t *zp, vattr_t * zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); + getnewvnode_drop_reserve(); + *xvpp = ZTOV(xzp); return (0); @@ -938,30 +789,29 @@ zfs_get_xattrdir(znode_t *zp, vnode_t ** { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; - zfs_dirlock_t *dl; vattr_t va; int error; top: - error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); if (error) return (error); if (xzp != NULL) { *xvpp = ZTOV(xzp); - zfs_dirent_unlock(dl); return (0); } - ASSERT(zp->z_phys->zp_xattr == 0); if (!(flags & CREATE_XATTR_DIR)) { - zfs_dirent_unlock(dl); - return (ENOENT); +#ifdef illumos + return (SET_ERROR(ENOENT)); +#else + return (SET_ERROR(ENOATTR)); +#endif } if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { - zfs_dirent_unlock(dl); - return (EROFS); + return (SET_ERROR(EROFS)); } /* @@ -980,12 +830,13 @@ top: zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xvpp, cr); - zfs_dirent_unlock(dl); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } + if (error == 0) + VOP_UNLOCK(*xvpp, 0); return (error); } @@ -1014,16 +865,16 @@ zfs_sticky_remove_access(znode_t *zdp, z if (zdp->z_zfsvfs->z_replay) return (0); - if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) + if ((zdp->z_mode & S_ISVTX) == 0) return (0); - downer = zfs_fuid_map_id(zfsvfs, zdp->z_phys->zp_uid, cr, ZFS_OWNER); - fowner = zfs_fuid_map_id(zfsvfs, zp->z_phys->zp_uid, cr, ZFS_OWNER); + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, 
zp->z_uid, cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); else - return (secpolicy_vnode_remove(cr)); + return (secpolicy_vnode_remove(ZTOV(zp), cr)); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_fm.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c 27 Feb 2010 22:31:21 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c 12 Jun 2012 05:57:28 -0000 @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + #include #include #include @@ -709,6 +713,10 @@ zfs_ereport_start_checksum(spa_t *spa, v if (report->zcr_ereport == NULL) { report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); + if (report->zcr_ckinfo != NULL) { + kmem_free(report->zcr_ckinfo, + sizeof (*report->zcr_ckinfo)); + } kmem_free(report, sizeof (*report)); return; } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c,v retrieving revision 1.3 diff -u -p -r1.3 zfs_fuid.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c 27 Feb 2010 23:43:53 -0000 1.3 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c 18 Apr 2017 00:53:30 -0000 @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -376,7 +375,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, u rw_enter(&zfsvfs->z_fuid_lock, RW_READER); - if (zfsvfs->z_fuid_obj) + if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty) domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); else domain = nulldomain; @@ -389,10 +388,8 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, u void zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) { - *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_uid, - cr, ZFS_OWNER); - *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_phys->zp_gid, - cr, ZFS_GROUP); + *uidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_gid, cr, ZFS_GROUP); } uid_t @@ -409,6 +406,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64 domain = zfs_fuid_find_by_idx(zfsvfs, index); ASSERT(domain != NULL); +#ifdef illumos if (type == ZFS_OWNER || type == ZFS_ACE_USER) { (void) kidmap_getuidbysid(crgetzone(cr), domain, FUID_RID(fuid), &id); @@ -416,6 +414,9 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64 (void) kidmap_getgidbysid(crgetzone(cr), domain, FUID_RID(fuid), &id); } +#else + id = UID_NOBODY; +#endif return (id); } @@ -487,6 +488,11 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuid /* * Create a file system FUID, based on information in the users cred + * + * If cred contains KSID_OWNER then it should be used to determine + * the uid otherwise cred's uid will be used. By default cred's gid + * is used unless it's an ephemeral ID in which case KSID_GROUP will + * be used if it exists. 
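For reference, the FUID_* macros used in this file pack a 64-bit FUID out of a domain-table index and a Windows RID; a minimal sketch of that encoding follows (illustrative only, mirroring what FUID_ENCODE/FUID_INDEX/FUID_RID express in the code below, with made-up values):

/*
 * Sketch, not part of the patch: a FUID carries the domain table index
 * in the upper 32 bits and the RID in the lower 32 bits.  The first
 * domain table entry is index 1 (note the "idx - 1" lookup in
 * zfs_fuid_create() below), so index 0 denotes a plain POSIX id.
 */
uint64_t idx  = 3;                              /* hypothetical domain slot */
uint64_t rid  = 1105;                           /* hypothetical Windows RID */
uint64_t fuid = ((uint64_t)idx << 32) | rid;    /* == FUID_ENCODE(idx, rid) */

ASSERT3U(fuid >> 32, ==, idx);                  /* FUID_INDEX(fuid) */
ASSERT3U(fuid & UINT32_MAX, ==, rid);           /* FUID_RID(fuid) */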
*/ uint64_t zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, @@ -501,32 +507,28 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, z VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); - if (type == ZFS_OWNER) - id = crgetuid(cr); - else - id = crgetgid(cr); - -#ifdef PORT_SOLARIS ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); - if (ksid) { - id = ksid_getid(ksid); - } else { - if (type == ZFS_OWNER) - id = crgetuid(cr); - else - id = crgetgid(cr); - if (IS_EPHEMERAL(id)) { - return ((uint64_t)(type == ZFS_OWNER ? - UID_NOBODY : GID_NOBODY)); - } + if (!zfsvfs->z_use_fuids || (ksid == NULL)) { + id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); + + if (IS_EPHEMERAL(id)) + return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); + + return ((uint64_t)id); } -#endif - if (!zfsvfs->z_use_fuids || (!IS_EPHEMERAL(id))) + /* + * ksid is present and FUID is supported + */ + id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr); + + if (!IS_EPHEMERAL(id)) return ((uint64_t)id); -#ifdef PORT_SOLARIS + if (type == ZFS_GROUP) + id = ksid_getid(ksid); + rid = ksid_getrid(ksid); domain = ksid_getdomain(ksid); @@ -535,9 +537,6 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, z zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); return (FUID_ENCODE(idx, rid)); -#else - panic(__func__); -#endif } /* @@ -561,9 +560,9 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64 uint32_t fuid_idx = FUID_INDEX(id); uint32_t rid; idmap_stat status; - uint64_t idx; + uint64_t idx = 0; zfs_fuid_t *zfuid = NULL; - zfs_fuid_info_t *fuidp; + zfs_fuid_info_t *fuidp = NULL; /* * If POSIX ID, or entry is already a FUID then @@ -585,10 +584,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64 * This is most likely a result of idmap service * not being available. */ - /* XXX NetBSD we need to define UID_NOBODY in - kernel sources otherwise */ if (fuidp == NULL) - return (crgetuid(cr)); + return (UID_NOBODY); + + VERIFY3U(type, >=, ZFS_OWNER); + VERIFY3U(type, <=, ZFS_ACE_GROUP); switch (type) { case ZFS_ACE_USER: @@ -606,9 +606,8 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64 idx = FUID_INDEX(fuidp->z_fuid_group); break; }; - domain = fuidp->z_domain_table[idx -1]; + domain = fuidp->z_domain_table[idx - 1]; } else { -#ifdef PORT_SOLARIS if (type == ZFS_OWNER || type == ZFS_ACE_USER) status = kidmap_getsidbyuid(crgetzone(cr), id, &domain, &rid); @@ -625,9 +624,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64 rid = UID_NOBODY; domain = nulldomain; } -#else - panic(__func__); -#endif } idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); @@ -707,12 +703,13 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuid boolean_t zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) { +#ifdef illumos ksid_t *ksid = crgetsid(cr, KSID_GROUP); + ksidlist_t *ksidlist = crgetsidlist(cr); +#endif uid_t gid; -#ifdef PORT_SOLARIS - ksidlist_t *ksidlist = crgetsidlist(cr); - +#ifdef illumos if (ksid && ksidlist) { int i; ksid_t *ksid_groups; @@ -744,7 +741,8 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64 } } } -#endif +#endif /* illumos */ + /* * Not found in ksidlist, check posix groups */ Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c,v retrieving revision 1.10 diff -u -p -r1.10 zfs_ioctl.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c 10 Apr 2015 20:55:38 -0000 1.10 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c 13 Sep 2017 07:55:04 
-0000 @@ -18,18 +18,139 @@ * * CDDL HEADER END */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Copyright 2014 Xin Li . All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome + */ + /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * ZFS ioctls. + * + * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage + * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. + * + * There are two ways that we handle ioctls: the legacy way where almost + * all of the logic is in the ioctl callback, and the new way where most + * of the marshalling is handled in the common entry point, zfsdev_ioctl(). + * + * Non-legacy ioctls should be registered by calling + * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked + * from userland by lzc_ioctl(). + * + * The registration arguments are as follows: + * + * const char *name + * The name of the ioctl. This is used for history logging. If the + * ioctl returns successfully (the callback returns 0), and allow_log + * is true, then a history log entry will be recorded with the input & + * output nvlists. The log entry can be printed with "zpool history -i". + * + * zfs_ioc_t ioc + * The ioctl request number, which userland will pass to ioctl(2). + * The ioctl numbers can change from release to release, because + * the caller (libzfs) must be matched to the kernel. + * + * zfs_secpolicy_func_t *secpolicy + * This function will be called before the zfs_ioc_func_t, to + * determine if this operation is permitted. It should return EPERM + * on failure, and 0 on success. Checks include determining if the + * dataset is visible in this zone, and if the user has either all + * zfs privileges in the zone (SYS_MOUNT), or has been granted permission + * to do this operation on this dataset with "zfs allow". + * + * zfs_ioc_namecheck_t namecheck + * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool + * name, a dataset name, or nothing. If the name is not well-formed, + * the ioctl will fail and the callback will not be called. + * Therefore, the callback can assume that the name is well-formed + * (e.g. is null-terminated, doesn't have more than one '@' character, + * doesn't have invalid characters). + * + * zfs_ioc_poolcheck_t pool_check + * This specifies requirements on the pool state. If the pool does + * not meet them (is suspended or is readonly), the ioctl will fail + * and the callback will not be called. If any checks are specified + * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. + * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | + * POOL_CHECK_READONLY). 
+ * + * boolean_t smush_outnvlist + * If smush_outnvlist is true, then the output is presumed to be a + * list of errors, and it will be "smushed" down to fit into the + * caller's buffer, by removing some entries and replacing them with a + * single "N_MORE_ERRORS" entry indicating how many were removed. See + * nvlist_smush() for details. If smush_outnvlist is false, and the + * outnvlist does not fit into the userland-provided buffer, then the + * ioctl will fail with ENOMEM. + * + * zfs_ioc_func_t *func + * The callback function that will perform the operation. + * + * The callback should return 0 on success, or an error number on + * failure. If the function fails, the userland ioctl will return -1, + * and errno will be set to the callback's return value. The callback + * will be called with the following arguments: + * + * const char *name + * The name of the pool or dataset to operate on, from + * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the + * expected type (pool, dataset, or none). + * + * nvlist_t *innvl + * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or + * NULL if no input nvlist was provided. Changes to this nvlist are + * ignored. If the input nvlist could not be deserialized, the + * ioctl will fail and the callback will not be called. + * + * nvlist_t *outnvl + * The output nvlist, initially empty. The callback can fill it in, + * and it will be returned to userland by serializing it into + * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization + * fails (e.g. because the caller didn't supply a large enough + * buffer), then the overall ioctl will fail. See the + * 'smush_nvlist' argument above for additional behaviors. + * + * There are two typical uses of the output nvlist: + * - To return state, e.g. property values. In this case, + * smush_outnvlist should be false. If the buffer was not large + * enough, the caller will reallocate a larger buffer and try + * the ioctl again. + * + * - To return multiple errors from an ioctl which makes on-disk + * changes. In this case, smush_outnvlist should be true. + * Ioctls which make on-disk modifications should generally not + * use the outnvl if they succeed, because the caller can not + * distinguish between the operation failing, and + * deserialization failing. 
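Concretely, a non-legacy ioctl under this scheme is just an nvlist-in/nvlist-out callback paired with a secpolicy function. A minimal sketch is shown below; the handler name, nvlist keys, ioctl number and the exact zfs_ioctl_register() argument order are assumptions for illustration, only the callback signature comes from the description above:

/* Sketch only -- not part of the patch. */
static int
zfs_ioc_example(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
{
	uint64_t val;

	/* innvl has already been deserialized by zfsdev_ioctl(). */
	if (nvlist_lookup_uint64(innvl, "value", &val) != 0)
		return (SET_ERROR(EINVAL));

	/* Whatever goes into outnvl is serialized back into zc_nvlist_dst. */
	fnvlist_add_uint64(outnvl, "result", val * 2);
	return (0);
}

/* Hypothetical registration, presumably done from zfs_ioctl_init(): */
zfs_ioctl_register("example", ZFS_IOC_EXAMPLE, zfs_ioc_example,
    zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED,
    B_FALSE /* smush_outnvlist */, B_TRUE /* allow_log */);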
*/ +#ifdef __FreeBSD__ +#include "opt_kstack_pages.h" +#endif #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include #include #include #include @@ -42,51 +163,75 @@ #include #include #include -#include #include #include #include #include #include #include -#include +#include +#include #include -#include #include #include #include -#include #include +#ifdef __FreeBSD__ +#include +#endif +#ifdef __NetBSD__ +#include +#include +#endif #include +#include #include #include #include +#include #include -#include +#include #include -#include -#include +#include +#include +#include +#include +#include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" +#include "zfs_comutil.h" +#include "zfs_ioctl_compat.h" -#ifdef __NetBSD__ -static int zfs_cmajor = -1; -static int zfs_bmajor = -1; -#define ddi_driver_major(x) zfs_cmajor +#ifdef __FreeBSD__ +CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX); +static struct cdev *zfsdev; #endif -extern struct modlfs zfs_modlfs; +#ifdef __NetBSD__ +static int zfs_cmajor = -1; +static int zfs_bmajor = -1; +dev_info_t *zfs_dip; -extern void zfs_init(void); -extern void zfs_fini(void); +#define ddi_driver_major(x) zfs_cmajor -ldi_ident_t zfs_li = NULL; -dev_info_t *zfs_dip; +#define zfs_init() /* nothing */ +#define zfs_fini() /* nothing */ + +#define vfs_busy(x, y) vfs_busy(x) +#define vfs_rel(x) vfs_rele(x) +#endif + +uint_t zfs_fsyncer_key; +extern uint_t rrw_tsd_key; +static uint_t zfs_allow_log_key; +extern uint_t zfs_geom_probe_vdev_key; -typedef int zfs_ioc_func_t(zfs_cmd_t *); -typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); +typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); +typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); typedef enum { NO_NAME, @@ -94,12 +239,21 @@ typedef enum { DATASET_NAME } zfs_ioc_namecheck_t; +typedef enum { + POOL_CHECK_NONE = 1 << 0, + POOL_CHECK_SUSPENDED = 1 << 1, + POOL_CHECK_READONLY = 1 << 2, +} zfs_ioc_poolcheck_t; + typedef struct zfs_ioc_vec { + zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; - boolean_t zvec_his_log; - boolean_t zvec_pool_check; + boolean_t zvec_allow_log; + zfs_ioc_poolcheck_t zvec_pool_check; + boolean_t zvec_smush_outnvlist; + const char *zvec_name; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ @@ -117,14 +271,21 @@ static int zfs_check_clearable(char *dat nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); -int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); +int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); +static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); + +#ifdef __FreeBSD__ +static void zfsdev_close(void *data); +#endif + +static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; - char buf[256]; + char buf[512]; va_list adx; /* @@ -199,9 +360,7 @@ zfs_is_bootfs(const char *name) } /* - * zfs_earlier_version - * - * Return non-zero if the spa version is less than requested version. + * Return non-zero if the spa version is less than requested version. 
*/ static int zfs_earlier_version(const char *name, int version) @@ -219,8 +378,6 @@ zfs_earlier_version(const char *name, in } /* - * zpl_earlier_version - * * Return TRUE if the ZPL version is less than requested version. */ static boolean_t @@ -255,7 +412,7 @@ zfs_log_history(zfs_cmd_t *zc) if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) - (void) spa_history_log(spa, buf, LOG_CMD_NORMAL); + (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); @@ -267,7 +424,7 @@ zfs_log_history(zfs_cmd_t *zc) */ /* ARGSUSED */ static int -zfs_secpolicy_none(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } @@ -278,67 +435,127 @@ zfs_secpolicy_none(zfs_cmd_t *zc, cred_t */ /* ARGSUSED */ static int -zfs_secpolicy_read(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - if (INGLOBALZONE(curproc) || + if (INGLOBALZONE(curthread) || zone_dataset_visible(zc->zc_name, NULL)) return (0); - return (ENOENT); + return (SET_ERROR(ENOENT)); } static int -zfs_dozonecheck(const char *dataset, cred_t *cr) +zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { - uint64_t zoned; int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ - if (!INGLOBALZONE(curproc) && + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) - return (ENOENT); - - if (dsl_prop_get_integer(dataset, "zoned", &zoned, NULL)) - return (ENOENT); + return (SET_ERROR(ENOENT)); - if (INGLOBALZONE(curproc)) { + if (INGLOBALZONE(curthread)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) - return (EPERM); + return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) - return (EPERM); + return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) - return (EPERM); + return (SET_ERROR(EPERM)); } return (0); } -int -zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + +#ifdef __NetBSD__ + zoned = 0; +#else + if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) + return (SET_ERROR(ENOENT)); +#endif + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) +{ + uint64_t zoned; + +#ifdef __NetBSD__ + zoned = 0; +#else + if (dsl_prop_get_int_ds(ds, "jailed", &zoned)) + return (SET_ERROR(ENOENT)); +#endif + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, + const char *perm, cred_t *cr) { int error; - error = zfs_dozonecheck(name, cr); + error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); - if (error) - error = dsl_deleg_access(name, perm, cr); + if (error != 0) + error = dsl_deleg_access_impl(ds, perm, cr); + } + return (error); +} + +static int +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +{ + int error; + dsl_dataset_t *ds; + dsl_pool_t *dp; + + /* + * First do a quick check for root in the global zone, which + * is allowed to do all write_perms. This ensures that zfs_ioc_* + * will get to handle nonexistent datasets. 
+ */ + if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0) + return (0); + + error = dsl_pool_hold(name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); } + + error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } +#ifdef SECLABEL /* * Policy for setting the security label property. * @@ -347,7 +564,6 @@ zfs_secpolicy_write_perms(const char *na static int zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) { -#ifdef PORT_SOLARIS char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; @@ -358,15 +574,15 @@ zfs_set_slabel_policy(const char *name, /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); - if (error) - return (EPERM); + if (error != 0) + return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) - return (EINVAL); + return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that @@ -375,7 +591,7 @@ zfs_set_slabel_policy(const char *name, */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) - return (EPERM); + return (SET_ERROR(EPERM)); return (0); } @@ -386,10 +602,10 @@ zfs_set_slabel_policy(const char *name, */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) - return (EPERM); + return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) - return (EPERM); + return (SET_ERROR(EPERM)); } /* @@ -408,8 +624,8 @@ zfs_set_slabel_policy(const char *name, */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); - if (error) - return (EPERM); + if (error != 0) + return (SET_ERROR(EPERM)); dmu_objset_disown(os, setsl_tag); @@ -419,7 +635,7 @@ zfs_set_slabel_policy(const char *name, } if (hexstr_to_label(strval, &new_sl) != 0) - return (EPERM); + return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; @@ -435,16 +651,13 @@ out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); -#else - return (ENOTSUP); -#endif } +#endif /* SECLABEL */ static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { -#ifdef PORT_SOLARIS char *strval; /* @@ -455,30 +668,33 @@ zfs_secpolicy_setprop(const char *dsname /* * Disallow setting of 'zoned' from within a local zone. */ - if (!INGLOBALZONE(curproc)) - return (EPERM); + if (!INGLOBALZONE(curthread)) + return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: - if (!INGLOBALZONE(curproc)) { + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + if (!INGLOBALZONE(curthread)) { uint64_t zoned; - char setpoint[MAXNAMELEN]; + char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the - * quota on things *under* (ie. contained by) + * limit on things *under* (ie. contained by) * the thing they own. 
*/ - if (dsl_prop_get_integer(dsname, "zoned", &zoned, + if (dsl_prop_get_integer(dsname, "jailed", &zoned, setpoint)) - return (EPERM); + return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) - return (EPERM); + return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: +#ifdef SECLABEL if (!is_system_labeled()) - return (EPERM); + return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; @@ -487,22 +703,23 @@ zfs_secpolicy_setprop(const char *dsname if (err != 0) return (err); } +#else + return (EOPNOTSUPP); +#endif break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); -#else - return (ENOTSUP); -#endif } -int -zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) +/* ARGSUSED */ +static int +zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); - if (error) + if (error != 0) return (error); /* @@ -512,78 +729,107 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_ return (0); } -int -zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) +/* ARGSUSED */ +static int +zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } -int -zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) +/* ARGSUSED */ +static int +zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + char *cp; + int error; + + /* + * Generate the current snapshot name from the given objsetid, then + * use that name for the secpolicy/zone checks. + */ + cp = strchr(zc->zc_name, '@'); + if (cp == NULL) + return (SET_ERROR(EINVAL)); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + dsl_dataset_name(ds, zc->zc_name); + + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND, cr); + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } +/* ARGSUSED */ static int -zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { vnode_t *vp; int error; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NULL, &vp)) != 0) + NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ -#ifndef __NetBSD__ - if (vp->v_vfsp->vfs_fstype != zfsfstype || + + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); - return (EPERM); + return (SET_ERROR(EPERM)); } -#endif + VN_RELE(vp); return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr)); } int -zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { -#ifdef __NetBSD__ - printf("XXX zfs_secpolicy_share write me\n"); - return EPERM; -#else - if (!INGLOBALZONE(curproc)) - return (EPERM); + if (!INGLOBALZONE(curthread)) + return (SET_ERROR(EPERM)); if (secpolicy_nfs(cr) == 0) { return (0); } else { - return (zfs_secpolicy_deleg_share(zc, cr)); + return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } -#endif } int -zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_smb_acl(zfs_cmd_t 
*zc, nvlist_t *innvl, cred_t *cr) { -#ifdef __NetBSD__ - printf("XXX zfs_secpolicy_share write me\n"); - return EPERM; -#else - if (!INGLOBALZONE(curproc)) - return (EPERM); + if (!INGLOBALZONE(curthread)) + return (SET_ERROR(EPERM)); if (secpolicy_smb(cr) == 0) { return (0); } else { - return (zfs_secpolicy_deleg_share(zc, cr)); + return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } -#endif /* __NetBSD__ */ } static int @@ -601,7 +847,7 @@ zfs_get_parent(const char *datasetname, } else { cp = strrchr(parent, '/'); if (cp == NULL) - return (ENOENT); + return (SET_ERROR(ENOENT)); cp[0] = '\0'; } @@ -620,51 +866,54 @@ zfs_secpolicy_destroy_perms(const char * return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } +/* ARGSUSED */ static int -zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires - * descendent mount and destroy permissions. - * Reassemble the full filesystem@snap name so dsl_deleg_access() - * can do the correct permission check. - * - * Since this routine is used when doing a recursive destroy of snapshots - * and destroying snapshots requires descendent permissions, a successfull - * check of the top level snapshot applies to snapshots of all descendent - * datasets as well. + * descendant mount and destroy permissions. */ +/* ARGSUSED */ static int -zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - int error; - char *dsname; - - dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + nvlist_t *snaps; + nvpair_t *pair, *nextpair; + int error = 0; - error = zfs_secpolicy_destroy_perms(dsname, cr); + if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) + return (SET_ERROR(EINVAL)); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nextpair) { + nextpair = nvlist_next_nvpair(snaps, pair); + error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); + if (error == ENOENT) { + /* + * Ignore any snapshots that don't exist (we consider + * them "already destroyed"). Remove the name from the + * nvl here in case the snapshot is created between + * now and when we try to destroy it (in which case + * we don't want to destroy it since we haven't + * checked for permission). + */ + fnvlist_remove_nvpair(snaps, pair); + error = 0; + } + if (error != 0) + break; + } - strfree(dsname); return (error); } -/* - * Must have sys_config privilege to check the iscsi permission - */ -/* ARGSUSED */ -static int -zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) -{ - return (secpolicy_zfs(cr)); -} - int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, @@ -690,55 +939,83 @@ zfs_secpolicy_rename_perms(const char *f return (error); } +/* ARGSUSED */ static int -zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); + char *at = NULL; + int error; + + if ((zc->zc_cookie & 1) != 0) { + /* + * This is recursive rename, so the starting snapshot might + * not exist. Check file system or volume permission instead. 
+ */ + at = strchr(zc->zc_name, '@'); + if (at == NULL) + return (EINVAL); + *at = '\0'; + } + + error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr); + + if (at != NULL) + *at = '@'; + + return (error); } +/* ARGSUSED */ static int -zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - char parentname[MAXNAMELEN]; - objset_t *clone; + dsl_pool_t *dp; + dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); - if (error) + if (error != 0) + return (error); + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) return (error); - error = dmu_objset_hold(zc->zc_name, FTAG, &clone); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { - dsl_dataset_t *pclone = NULL; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_t *origin = NULL; dsl_dir_t *dd; - dd = clone->os_dsl_dataset->ds_dir; + dd = clone->ds_dir; - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dd->dd_pool, - dd->dd_phys->dd_origin_obj, FTAG, &pclone); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (error) { - dmu_objset_rele(clone, FTAG); + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(clone, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } - error = zfs_secpolicy_write_perms(zc->zc_name, + error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); - dsl_dataset_name(pclone, parentname); - dmu_objset_rele(clone, FTAG); - dsl_dataset_rele(pclone, FTAG); - if (error == 0) - error = zfs_secpolicy_write_perms(parentname, + dsl_dataset_name(origin, parentname); + if (error == 0) { + error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); + } + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(origin, FTAG); } + dsl_pool_rele(dp, FTAG); return (error); } +/* ARGSUSED */ static int -zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; @@ -761,49 +1038,141 @@ zfs_secpolicy_snapshot_perms(const char ZFS_DELEG_PERM_SNAPSHOT, cr)); } +/* + * Check for permission to create each snapshot in the nvlist. + */ +/* ARGSUSED */ static int -zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { + nvlist_t *snaps; + int error; + nvpair_t *pair; - return (zfs_secpolicy_snapshot_perms(zc->zc_name, cr)); + if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) + return (SET_ERROR(EINVAL)); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char *name = nvpair_name(pair); + char *atp = strchr(name, '@'); + + if (atp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + *atp = '\0'; + error = zfs_secpolicy_snapshot_perms(name, cr); + *atp = '@'; + if (error != 0) + break; + } + return (error); } +/* + * Check for permission to create each snapshot in the nvlist. 
+ */ +/* ARGSUSED */ static int -zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - char parentname[MAXNAMELEN]; - int error; + int error = 0; - if ((error = zfs_get_parent(zc->zc_name, parentname, - sizeof (parentname))) != 0) - return (error); + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + char *name = nvpair_name(pair); + char *hashp = strchr(name, '#'); - if (zc->zc_value[0] != '\0') { - if ((error = zfs_secpolicy_write_perms(zc->zc_value, - ZFS_DELEG_PERM_CLONE, cr)) != 0) - return (error); + if (hashp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + *hashp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_BOOKMARK, cr); + *hashp = '#'; + if (error != 0) + break; } + return (error); +} - if ((error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_CREATE, cr)) != 0) - return (error); +/* ARGSUSED */ +static int +zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvpair_t *pair, *nextpair; + int error = 0; + + for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; + pair = nextpair) { + char *name = nvpair_name(pair); + char *hashp = strchr(name, '#'); + nextpair = nvlist_next_nvpair(innvl, pair); + + if (hashp == NULL) { + error = SET_ERROR(EINVAL); + break; + } - error = zfs_secpolicy_write_perms(parentname, - ZFS_DELEG_PERM_MOUNT, cr); + *hashp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_DESTROY, cr); + *hashp = '#'; + if (error == ENOENT) { + /* + * Ignore any filesystems that don't exist (we consider + * their bookmarks "already destroyed"). Remove + * the name from the nvl here in case the filesystem + * is created between now and when we try to destroy + * the bookmark (in which case we don't want to + * destroy it since we haven't checked for permission). + */ + fnvlist_remove_nvpair(innvl, pair); + error = 0; + } + if (error != 0) + break; + } return (error); } +/* ARGSUSED */ static int -zfs_secpolicy_umount(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - int error; + /* + * Even root must have a proper TSD so that we know what pool + * to log to. + */ + if (tsd_get(zfs_allow_log_key) == NULL) + return (SET_ERROR(EPERM)); + return (0); +} - error = secpolicy_fs_unmount(cr, NULL); - if (error) { - error = dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr); - } - return (error); +static int +zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + char *origin; + + if ((error = zfs_get_parent(zc->zc_name, parentname, + sizeof (parentname))) != 0) + return (error); + + if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && + (error = zfs_secpolicy_write_perms(origin, + ZFS_DELEG_PERM_CLONE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)); } /* @@ -812,32 +1181,49 @@ zfs_secpolicy_umount(zfs_cmd_t *zc, cred */ /* ARGSUSED */ static int -zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) - return (EPERM); + return (SET_ERROR(EPERM)); return (0); } /* - * Policy for fault injection. Requires all privileges. + * Policy for object to name lookups. 
*/ /* ARGSUSED */ static int -zfs_secpolicy_inject(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (secpolicy_zinject(cr)); + int error; + + if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + return (0); + + error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); + return (error); +} + +/* + * Policy for fault injection. Requires all privileges. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (secpolicy_zinject(cr)); } +/* ARGSUSED */ static int -zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) - return (EINVAL); + return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { @@ -847,14 +1233,14 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cre } static int -zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - int err = zfs_secpolicy_read(zc, cr); + int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (EINVAL); + return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* @@ -876,38 +1262,99 @@ zfs_secpolicy_userspace_one(zfs_cmd_t *z } static int -zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - int err = zfs_secpolicy_read(zc, cr); + int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (EINVAL); + return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } +/* ARGSUSED */ static int -zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } +/* ARGSUSED */ static int -zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_HOLD, cr)); + nvpair_t *pair; + nvlist_t *holds; + int error; + + error = nvlist_lookup_nvlist(innvl, "holds", &holds); + if (error != 0) + return (SET_ERROR(EINVAL)); + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_HOLD, cr); + if (error != 0) + return (error); + } + return (0); } +/* ARGSUSED */ static int -zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) +zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { - return (zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_RELEASE, cr)); + nvpair_t *pair; + int error; + + for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(innvl, pair)) { + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_RELEASE, cr); + if (error != 0) + return (error); + } + return (0); +} + +/* + * Policy for allowing 
temporary snapshots to be taken or released + */ +static int +zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + /* + * A temporary snapshot is the same as a snapshot, + * hold, destroy and release all rolled into one. + * Delegated diff alone is sufficient that we allow this. + */ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_DIFF, cr)) == 0) + return (0); + + error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); + if (error == 0) + error = zfs_secpolicy_hold(zc, innvl, cr); + if (error == 0) + error = zfs_secpolicy_release(zc, innvl, cr); + if (error == 0) + error = zfs_secpolicy_destroy(zc, innvl, cr); + return (error); } /* @@ -924,14 +1371,14 @@ get_nvlist(uint64_t nvl, uint64_t size, * Read in and unpack the user-supplied nvlist. */ if (size == 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); packed = kmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { kmem_free(packed, size); - return (error); + return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { @@ -945,36 +1392,40 @@ get_nvlist(uint64_t nvl, uint64_t size, return (0); } +/* + * Reduce the size of this nvlist until it can be serialized in 'max' bytes. + * Entries will be removed from the end of the nvlist, and one int32 entry + * named "N_MORE_ERRORS" will be added indicating how many entries were + * removed. + */ static int -fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) +nvlist_smush(nvlist_t *errors, size_t max) { size_t size; - VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); + size = fnvlist_size(errors); - if (size > zc->zc_nvlist_dst_size) { + if (size > max) { nvpair_t *more_errors; int n = 0; - if (zc->zc_nvlist_dst_size < 1024) - return (ENOMEM); + if (max < 1024) + return (SET_ERROR(ENOMEM)); - VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); - more_errors = nvlist_prev_nvpair(*errors, NULL); + fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); + more_errors = nvlist_prev_nvpair(errors, NULL); do { - nvpair_t *pair = nvlist_prev_nvpair(*errors, + nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); - VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); + fnvlist_remove_nvpair(errors, pair); n++; - VERIFY(nvlist_size(*errors, &size, - NV_ENCODE_NATIVE) == 0); - } while (size > zc->zc_nvlist_dst_size); - - VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); - VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); - ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); - ASSERT(size <= zc->zc_nvlist_dst_size); + size = fnvlist_size(errors); + } while (size > max); + + fnvlist_remove_nvpair(errors, more_errors); + fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); + ASSERT3U(fnvlist_size(errors), <=, max); } return (0); @@ -984,23 +1435,31 @@ static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; + int error = 0; size_t size; - int error; - VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); + size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { - error = ENOMEM; + /* + * Solaris returns ENOMEM here, because even if an error is + * returned from an ioctl(2), new zc_nvlist_dst_size will be + * passed to the userland. This is not the case for FreeBSD. + * We need to return 0, so the kernel will copy the + * zc_nvlist_dst_size back and the userland can discover that a + * bigger buffer is needed. 
+ */ + error = 0; } else { - packed = kmem_alloc(size, KM_SLEEP); - VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, - KM_SLEEP) == 0); - error = ddi_copyout(packed, - (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags); - kmem_free(packed, size); + packed = fnvlist_pack(nvl, &size); + if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, + size, zc->zc_iflags) != 0) + error = SET_ERROR(EFAULT); + fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; + zc->zc_nvlist_dst_filled = B_TRUE; return (error); } @@ -1008,49 +1467,62 @@ static int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; + vfs_t *vfsp; int error; error = dmu_objset_hold(dsname, FTAG, &os); - if (error) + if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); - return (EINVAL); + return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); if (*zfvp) { - VFS_HOLD((*zfvp)->z_vfs); + vfsp = (*zfvp)->z_vfs; + vfs_ref(vfsp); } else { - error = ESRCH; + error = SET_ERROR(ESRCH); } mutex_exit(&os->os_user_ptr_lock); dmu_objset_rele(os, FTAG); + if (error == 0) { + error = vfs_busy(vfsp, 0); + vfs_rel(vfsp); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + } return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. + * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, + * which prevents all vnode ops from running. */ static int -zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp) +zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, zfvp); if (error == 0) { - rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag); + rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : + RW_READER, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. 
*/ - rrw_exit(&(*zfvp)->z_teardown_lock, tag); - return (EBUSY); + rrm_exit(&(*zfvp)->z_teardown_lock, tag); + return (SET_ERROR(EBUSY)); } } return (error); @@ -1059,10 +1531,14 @@ zfsvfs_hold(const char *name, void *tag, static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { - rrw_exit(&zfsvfs->z_teardown_lock, tag); + rrm_exit(&zfsvfs->z_teardown_lock, tag); if (zfsvfs->z_vfs) { +#ifdef illumos VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif } else { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); @@ -1076,7 +1552,6 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; - char *buf; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) @@ -1095,8 +1570,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); - if (version < SPA_VERSION_INITIAL || version > SPA_VERSION) { - error = EINVAL; + if (!SPA_VERSION_IS_SUPPORTED(version)) { + error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); @@ -1112,13 +1587,11 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); - if (error) + if (error != 0) goto pool_props_bad; } - buf = history_str_get(zc); - - error = spa_create(zc->zc_name, config, props, buf, zplprops); + error = spa_create(zc->zc_name, config, props, zplprops); /* * Set the remaining root properties @@ -1127,9 +1600,6 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); - if (buf != NULL) - history_str_free(buf); - pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); @@ -1170,19 +1640,20 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) - error = EINVAL; - else if (zc->zc_cookie) - error = spa_import_verbatim(zc->zc_name, config, props); + error = SET_ERROR(EINVAL); else - error = spa_import(zc->zc_name, config, props); + error = spa_import(zc->zc_name, config, props, zc->zc_cookie); + + if (zc->zc_nvlist_dst != 0) { + int err; - if (zc->zc_nvlist_dst != 0) - (void) put_nvlist(zc, config); + if ((err = put_nvlist(zc, config)) != 0) + error = err; + } nvlist_free(config); - if (props) - nvlist_free(props); + nvlist_free(props); return (error); } @@ -1208,7 +1679,7 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) - return (EEXIST); + return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); @@ -1217,6 +1688,15 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * + * outputs: + * zc_cookie real errno + * zc_nvlist_dst config nvlist + * zc_nvlist_dst_size size of config nvlist + */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { @@ -1263,7 +1743,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) nvlist_free(tryconfig); if (config == NULL) - return (EINVAL); + return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); @@ -1271,8 +1751,13 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_cookie scan func (pool_scan_func_t) + */ static int -zfs_ioc_pool_scrub(zfs_cmd_t *zc) +zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; @@ -1280,7 +1765,10 @@ 
zfs_ioc_pool_scrub(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_scrub(spa, zc->zc_cookie); + if (zc->zc_cookie == POOL_SCAN_NONE) + error = spa_scan_stop(spa); + else + error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); @@ -1310,9 +1798,10 @@ zfs_ioc_pool_upgrade(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - if (zc->zc_cookie < spa_version(spa) || zc->zc_cookie > SPA_VERSION) { + if (zc->zc_cookie < spa_version(spa) || + !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); - return (EINVAL); + return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); @@ -1330,14 +1819,14 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) int error; if ((size = zc->zc_history_len) == 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } hist_buf = kmem_alloc(size, KM_SLEEP); @@ -1354,14 +1843,23 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) } static int -zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) +zfs_ioc_pool_reguid(zfs_cmd_t *zc) { + spa_t *spa; int error; - if (error = dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)) - return (error); + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + error = spa_change_guid(spa); + spa_close(spa, FTAG); + } + return (error); +} - return (0); +static int +zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) +{ + return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* @@ -1383,7 +1881,7 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); - return (EINVAL); + return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); @@ -1392,6 +1890,35 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_stat stats on object + * zc_value path to object + */ +static int +zfs_ioc_obj_to_stats(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) + return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (SET_ERROR(EINVAL)); + } + error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, + sizeof (zc->zc_value)); + dmu_objset_rele(os, FTAG); + + return (error); +} + static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { @@ -1412,6 +1939,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, &spares, &nspares); +#ifdef illumos /* * A root pool with concatenated devices is not supported. * Thus, can not add a device to a root pool. @@ -1425,8 +1953,9 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { nvlist_free(config); spa_close(spa, FTAG); - return (EDOM); + return (SET_ERROR(EDOM)); } +#endif /* illumos */ if (error == 0) { error = spa_vdev_add(spa, config); @@ -1436,6 +1965,12 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_name name of the pool + * zc_nvlist_conf nvlist of devices to remove + * zc_cookie to stop the remove? 
+ */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { @@ -1485,7 +2020,7 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) break; default: - error = EINVAL; + error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); @@ -1597,26 +2132,12 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) return (error); } -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: - * zc_objset_stats stats - * zc_nvlist_dst property nvlist - * zc_nvlist_dst_size size of property nvlist - */ static int -zfs_ioc_objset_stats(zfs_cmd_t *zc) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { - objset_t *os = NULL; - int error; + int error = 0; nvlist_t *nv; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) - return (error); - dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && @@ -1629,15 +2150,44 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) * inconsistent. So this is a bit of a workaround... * XXX reading with out owning */ - if (!zc->zc_objset_stats.dds_inconsistent) { - if (dmu_objset_type(os) == DMU_OST_ZVOL) - VERIFY(zvol_get_stats(os, nv) == 0); + if (!zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZVOL) { + error = zvol_get_stats(os, nv); + if (error == EIO) + return (error); + VERIFY0(error); } error = put_nvlist(zc, nv); nvlist_free(nv); } - dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_objset_stats(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, os); + dmu_objset_rele(os, FTAG); + } + + if (error == ENOMEM) + error = 0; return (error); } @@ -1657,30 +2207,23 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { - objset_t *os = NULL; - int error; + int error = 0; nvlist_t *nv; - if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) - return (error); - /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. 
*/ - if (!dsl_prop_get_hasrecvd(os)) { - dmu_objset_rele(os, FTAG); - return (ENOTSUP); - } + if (!dsl_prop_get_hasrecvd(zc->zc_name)) + return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_received(os, &nv)) == 0) { + (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } - dmu_objset_rele(os, FTAG); return (error); } @@ -1739,13 +2282,13 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) err = put_nvlist(zc, nv); nvlist_free(nv); } else { - err = ENOENT; + err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } -static boolean_t +boolean_t dataset_name_hidden(const char *name) { /* @@ -1757,7 +2300,7 @@ dataset_name_hidden(const char *name) return (B_TRUE); if (strchr(name, '%') != NULL) return (B_TRUE); - if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL)) + if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) return (B_TRUE); return (B_FALSE); } @@ -1786,7 +2329,7 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) top: if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) - error = ESRCH; + error = SET_ERROR(ESRCH); return (error); } @@ -1795,26 +2338,13 @@ top: (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); - /* - * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 - * but is not declared void because its called by dmu_objset_find(). - */ - if (zc->zc_cookie == 0) { - uint64_t cookie = 0; - int len = sizeof (zc->zc_name) - (p - zc->zc_name); - - while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) - (void) dmu_objset_prefetch(p, NULL); - } - do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) - error = ESRCH; - } while (error == 0 && dataset_name_hidden(zc->zc_name) && - !(zc->zc_iflags & FKIOCTL)); + error = SET_ERROR(ESRCH); + } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* @@ -1837,6 +2367,7 @@ top: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist + * zc_simple when set, only name is requested * * outputs: * zc_name name of next snapshot @@ -1850,41 +2381,46 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc objset_t *os; int error; -top: - if (zc->zc_cookie == 0) - (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, - NULL, DS_FIND_SNAPSHOTS); - error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) { return (error == ENOENT ? ESRCH : error); + } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ - if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { + if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= + ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); - return (ESRCH); + return (SET_ERROR(ESRCH)); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), - zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); - dmu_objset_rele(os, FTAG); - if (error == 0) { - error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - if (error == ENOENT) { - /* We lost a race with destroy, get the next one. 
*/ - *strchr(zc->zc_name, '@') = '\0'; - goto top; + zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, + NULL); + + if (error == 0 && !zc->zc_simple) { + dsl_dataset_t *ds; + dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; + + error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); + if (error == 0) { + objset_t *ossnap; + + error = dmu_objset_from_ds(ds, &ossnap); + if (error == 0) + error = zfs_ioc_objset_stats_impl(zc, ossnap); + dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { - error = ESRCH; + error = SET_ERROR(ESRCH); } + dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ - if (error) + if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } @@ -1896,6 +2432,7 @@ zfs_prop_set_userquota(const char *dsnam uint64_t *valary; unsigned int vallen; const char *domain; + char *dash; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; @@ -1905,22 +2442,26 @@ zfs_prop_set_userquota(const char *dsnam if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &pair) == 0); + if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) != 0) + return (SET_ERROR(EINVAL)); } - VERIFY(nvpair_value_uint64_array(pair, &valary, &vallen) == 0); - VERIFY(vallen == 3); - type = valary[0]; - rid = valary[1]; - quota = valary[2]; /* - * The propname is encoded as + * A correctly constructed propname is encoded as * userquota@-. */ - domain = strchr(propname, '-') + 1; + if ((dash = strchr(propname, '-')) == NULL || + nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || + vallen != 3) + return (SET_ERROR(EINVAL)); + + domain = dash + 1; + type = valary[0]; + rid = valary[1]; + quota = valary[2]; - err = zfsvfs_hold(dsname, FTAG, &zfsvfs); + err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); @@ -1934,7 +2475,7 @@ zfs_prop_set_userquota(const char *dsnam * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * - * XXX: It would be better for callers of the properety interface if we handled + * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int @@ -1944,7 +2485,7 @@ zfs_prop_set_special(const char *dsname, const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; - int err; + int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) @@ -1969,23 +2510,37 @@ zfs_prop_set_special(const char *dsname, err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: - err = dsl_dataset_set_quota(dsname, source, intval); + err = dsl_dataset_set_refquota(dsname, source, intval); + break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + if (intval == UINT64_MAX) { + /* clearing the limit, just do it */ + err = 0; + } else { + err = dsl_dir_activate_fs_ss_limit(dsname); + } + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. 
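The zfs_prop_set_userquota() change above also replaces an unchecked strchr(propname, '-') + 1 with explicit validation, because the property name comes straight from userland and may lack the separator. A standalone sketch of that defensive step (the sample name is made up; the exact encoding is whatever the comment above describes):

#include <stdio.h>
#include <string.h>
#include <errno.h>

static int
split_userquota_domain(const char *propname, const char **domainp)
{
	const char *dash = strchr(propname, '-');

	if (dash == NULL)
		return (EINVAL);	/* malformed name: reject instead of crashing */
	*domainp = dash + 1;
	return (0);
}

int
main(void)
{
	const char *domain;

	if (split_userquota_domain("userquota@0-mydomain", &domain) == 0)
		printf("domain: %s\n", domain);
	return (0);
}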
+ */ + if (err == 0) + err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: - err = dsl_dataset_set_reservation(dsname, source, intval); + err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: - err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip), - intval); + err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; - if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0) + if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); @@ -2001,7 +2556,6 @@ zfs_prop_set_special(const char *dsname, } break; } - default: err = -1; } @@ -2011,31 +2565,25 @@ zfs_prop_set_special(const char *dsname, /* * This function is best effort. If it fails to set any of the given properties, - * it continues to set as many as it can and returns the first error - * encountered. If the caller provides a non-NULL errlist, it also gives the - * complete list of names of all the properties it failed to set along with the - * corresponding error numbers. The caller is responsible for freeing the - * returned errlist. + * it continues to set as many as it can and returns the last error + * encountered. If the caller provides a non-NULL errlist, it will be filled in + * with the list of names of all the properties that failed along with the + * corresponding error numbers. * - * If every property is set successfully, zero is returned and the list pointed - * at by errlist is NULL. + * If every property is set successfully, zero is returned and errlist is not + * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, - nvlist_t **errlist) + nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; char *strval; - nvlist_t *genericnvl; - nvlist_t *errors; - nvlist_t *retrynvl; - - VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + nvlist_t *genericnvl = fnvlist_alloc(); + nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; @@ -2048,48 +2596,50 @@ retry: propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &propval) == 0); + attrs = fnvpair_value_nvlist(pair); + if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) != 0) + err = SET_ERROR(EINVAL); } /* Validate value type */ - if (prop == ZPROP_INVAL) { + if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) - err = EINVAL; + err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) - err = EINVAL; + err = SET_ERROR(EINVAL); + } else { + err = SET_ERROR(EINVAL); } - } else { + } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) - err = EINVAL; + err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; - VERIFY(nvpair_value_uint64(propval, - &intval) == 0); + intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: - err = EINVAL; + err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, 
intval, &unused) != 0) - err = EINVAL; + err = SET_ERROR(EINVAL); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { - err = EINVAL; + err = SET_ERROR(EINVAL); } } @@ -2115,8 +2665,11 @@ retry: } } - if (err != 0) - VERIFY(nvlist_add_int32(errors, propname, err) == 0); + if (err != 0) { + if (errlist != NULL) + fnvlist_add_int32(errlist, propname, err); + rv = err; + } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { @@ -2138,44 +2691,33 @@ retry: propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; - VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); - VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, - &propval) == 0); + attrs = fnvpair_value_nvlist(pair); + propval = fnvlist_lookup_nvpair(attrs, + ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { - VERIFY(nvpair_value_string(propval, - &strval) == 0); - err = dsl_prop_set(dsname, propname, source, 1, - strlen(strval) + 1, strval); + strval = fnvpair_value_string(propval); + err = dsl_prop_set_string(dsname, propname, + source, strval); } else { - VERIFY(nvpair_value_uint64(propval, - &intval) == 0); - err = dsl_prop_set(dsname, propname, source, 8, - 1, &intval); + intval = fnvpair_value_uint64(propval); + err = dsl_prop_set_int(dsname, propname, source, + intval); } if (err != 0) { - VERIFY(nvlist_add_int32(errors, propname, - err) == 0); + if (errlist != NULL) { + fnvlist_add_int32(errlist, propname, + err); + } + rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); - if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { - nvlist_free(errors); - errors = NULL; - } else { - VERIFY(nvpair_value_int32(pair, &rv) == 0); - } - - if (errlist == NULL) - nvlist_free(errors); - else - *errlist = errors; - return (rv); } @@ -2183,28 +2725,26 @@ retry: * Check that all the properties are valid user properties. */ static int -zfs_check_userprops(char *fsname, nvlist_t *nvl) +zfs_check_userprops(const char *fsname, nvlist_t *nvl) { nvpair_t *pair = NULL; int error = 0; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); - char *valstr; if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) - return (EINVAL); + return (SET_ERROR(EINVAL)); if (error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_USERPROP, CRED())) return (error); if (strlen(propname) >= ZAP_MAXNAMELEN) - return (ENAMETOOLONG); + return (SET_ERROR(ENAMETOOLONG)); - VERIFY(nvpair_value_string(pair, &valstr) == 0); - if (strlen(valstr) >= ZAP_MAXVALUELEN) + if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (E2BIG); } return (0); @@ -2227,7 +2767,7 @@ props_skip(nvlist_t *props, nvlist_t *sk } static int -clear_received_props(objset_t *os, const char *fs, nvlist_t *props, +clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; @@ -2239,8 +2779,8 @@ clear_received_props(objset_t *os, const * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | - (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); - err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); + (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); @@ -2263,7 +2803,7 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); - nvlist_t *errors = NULL; + nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, @@ -2272,23 +2812,21 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) if (received) { nvlist_t *origprops; - objset_t *os; - - if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { - if (dsl_prop_get_received(os, &origprops) == 0) { - (void) clear_received_props(os, - zc->zc_name, origprops, nvl); - nvlist_free(origprops); - } - dsl_prop_set_hasrecvd(os); - dmu_objset_rele(os, FTAG); + if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { + (void) clear_received_props(zc->zc_name, + origprops, nvl); + nvlist_free(origprops); } + + error = dsl_prop_set_hasrecvd(zc->zc_name); } - error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors); + errors = fnvlist_alloc(); + if (error == 0) + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); - if (zc->zc_nvlist_dst != 0 && errors != 0) { + if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } @@ -2327,12 +2865,12 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) */ if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) - return (EINVAL); + return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { - return (EINVAL); + return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } @@ -2349,7 +2887,7 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) break; default: nvlist_free(dummy); - return (EINVAL); + return (SET_ERROR(EINVAL)); } pair = nvlist_next_nvpair(dummy, NULL); @@ -2365,11 +2903,11 @@ zfs_ioc_inherit_prop(zfs_cmd_t *zc) * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) - return (EINVAL); + return (SET_ERROR(EINVAL)); } - /* the property name has been validated by zfs_secpolicy_inherit() */ - return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); + /* property name has been validated by zfs_secpolicy_inherit_prop() */ + return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); } static int @@ -2442,56 +2980,9 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else - error = EFAULT; - - nvlist_free(nvp); - return (error); -} - -static int -zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) -{ - nvlist_t *nvp; - int error; - uint32_t uid; - uint32_t gid; - uint32_t *groups; - uint_t group_cnt; - cred_t *usercred; - - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &nvp)) != 0) { - return (error); - } - - if ((error = nvlist_lookup_uint32(nvp, - ZFS_DELEG_PERM_UID, &uid)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - - if ((error = nvlist_lookup_uint32(nvp, - ZFS_DELEG_PERM_GID, &gid)) != 0) { - nvlist_free(nvp); - return (EPERM); - } + error = SET_ERROR(EFAULT); - if ((error = nvlist_lookup_uint32_array(nvp, ZFS_DELEG_PERM_GROUPS, - &groups, &group_cnt)) != 0) { - nvlist_free(nvp); - return (EPERM); - } - usercred = cralloc(); - if ((crsetugid(usercred, uid, gid) != 0) || - (crsetgroups(usercred, group_cnt, (gid_t *)groups) != 0)) { - nvlist_free(nvp); - crfree(usercred); - return (EPERM); - } nvlist_free(nvp); - error = dsl_deleg_access(zc->zc_name, - zfs_prop_to_name(ZFS_PROP_SHAREISCSI), usercred); - crfree(usercred); return (error); } @@ -2518,7 +3009,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -2528,7 +3019,7 
@@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) */ error = secpolicy_zfs(CRED()); - if (error) { + if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); @@ -2574,9 +3065,35 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc) static vfs_t * zfs_get_vfs(const char *resource) { + vfs_t *vfsp; + +#ifdef __FreeBSD__ + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(vfsp, &mountlist, mnt_list) { + if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { + if (vfs_busy(vfsp, MBF_MNTLSTLOCK) != 0) + vfsp = NULL; + break; + } + } + if (vfsp == NULL) + mtx_unlock(&mountlist_mtx); +#endif +#ifdef __NetBSD__ + mount_iterator_t *iter; + + mountlist_iterator_init(&iter); + while ((vfsp = mountlist_iterator_next(iter)) != NULL) { + if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { + if (vfs_busy(vfsp, 0) != 0) + vfsp = NULL; + break; + } + } + mountlist_iterator_destroy(iter); +#endif - printf("XXX zfs_get_vfs write me\n"); - return NULL; + return (vfsp); } /* ARGSUSED */ @@ -2592,10 +3109,10 @@ zfs_create_cb(objset_t *os, void *arg, c /* * inputs: - * createprops list of properties requested by creator - * default_zplver zpl version to use if unspecified in createprops - * fuids_ok fuids allowed in this version of the spa? * os parent objset pointer (NULL if root fs) + * fuids_ok fuids allowed in this version of the spa? + * sa_ok SAs allowed in this version of the spa? + * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object @@ -2612,8 +3129,8 @@ zfs_create_cb(objset_t *os, void *arg, c */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, - boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, - boolean_t *is_ci) + boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; @@ -2649,10 +3166,11 @@ zfs_fill_zplprops_impl(objset_t *os, uin */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || + (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops @@ -2690,11 +3208,13 @@ static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - boolean_t fuids_ok = B_TRUE; + boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; - char parentname[MAXNAMELEN]; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; char *cp; + spa_t *spa; + uint64_t spa_vers; int error; (void) strlcpy(parentname, dataset, sizeof (parentname)); @@ -2702,12 +3222,15 @@ zfs_fill_zplprops(const char *dataset, n ASSERT(cp != NULL); cp[0] = '\0'; - if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE)) - zplver = ZPL_VERSION_USERSPACE - 1; - if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { - zplver = ZPL_VERSION_FUID - 1; - fuids_ok = B_FALSE; - } + if ((error = spa_open(dataset, &spa, FTAG)) != 0) + return (error); + + spa_vers = spa_version(spa); + spa_close(spa, FTAG); + + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
@@ -2715,7 +3238,7 @@ zfs_fill_zplprops(const char *dataset, n if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); - error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); @@ -2725,41 +3248,45 @@ static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - boolean_t fuids_ok = B_TRUE; + boolean_t fuids_ok; + boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; - if (spa_vers < SPA_VERSION_FUID) { - zplver = ZPL_VERSION_FUID - 1; - fuids_ok = B_FALSE; - } + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); - error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, createprops, - zplprops, is_ci); + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, + createprops, zplprops, is_ci); return (error); } /* - * inputs: - * zc_objset_type type of objset to create (fs vs zvol) - * zc_name name of new objset - * zc_value name of snapshot to clone from (may be empty) - * zc_nvlist_src{_size} nvlist of properties to apply + * innvl: { + * "type" -> dmu_objset_type_t (int32) + * (optional) "props" -> { prop -> value } + * } * - * outputs: none + * outnvl: propname -> error code (int32) */ static int -zfs_ioc_create(zfs_cmd_t *zc) +zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { - objset_t *clone; int error = 0; - zfs_creat_t zct; + zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); - dmu_objset_type_t type = zc->zc_objset_type; + int32_t type32; + dmu_objset_type_t type; + boolean_t is_insensitive = B_FALSE; + + if (nvlist_lookup_int32(innvl, "type", &type32) != 0) + return (SET_ERROR(EINVAL)); + type = type32; + (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); switch (type) { - case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; @@ -2772,359 +3299,626 @@ zfs_ioc_create(zfs_cmd_t *zc) cbfunc = NULL; break; } - if (strchr(zc->zc_name, '@') || - strchr(zc->zc_name, '%')) - return (EINVAL); - - if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &nvprops)) != 0) - return (error); + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); - zct.zct_zplprops = NULL; zct.zct_props = nvprops; - if (zc->zc_value[0] != '\0') { - /* - * We're creating a clone of an existing snapshot. 
- */ - zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; - if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0) { - nvlist_free(nvprops); - return (EINVAL); - } - - error = dmu_objset_hold(zc->zc_value, FTAG, &clone); - if (error) { - nvlist_free(nvprops); - return (error); - } - - error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); - dmu_objset_rele(clone, FTAG); - if (error) { - nvlist_free(nvprops); - return (error); - } - } else { - boolean_t is_insensitive = B_FALSE; - - if (cbfunc == NULL) { - nvlist_free(nvprops); - return (EINVAL); - } - - if (type == DMU_OST_ZVOL) { - uint64_t volsize, volblocksize; + if (cbfunc == NULL) + return (SET_ERROR(EINVAL)); - if (nvprops == NULL || - nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLSIZE), - &volsize) != 0) { - nvlist_free(nvprops); - return (EINVAL); - } + if (type == DMU_OST_ZVOL) { + uint64_t volsize, volblocksize; - if ((error = nvlist_lookup_uint64(nvprops, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize)) != 0 && error != ENOENT) { - nvlist_free(nvprops); - return (EINVAL); - } + if (nvprops == NULL) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) + return (SET_ERROR(EINVAL)); + + if ((error = nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize)) != 0 && error != ENOENT) + return (SET_ERROR(EINVAL)); - if (error != 0) - volblocksize = zfs_prop_default_numeric( - ZFS_PROP_VOLBLOCKSIZE); + if (error != 0) + volblocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); - if ((error = zvol_check_volblocksize( - volblocksize)) != 0 || - (error = zvol_check_volsize(volsize, - volblocksize)) != 0) { - nvlist_free(nvprops); - return (error); - } - } else if (type == DMU_OST_ZFS) { - int error; + if ((error = zvol_check_volblocksize( + volblocksize)) != 0 || + (error = zvol_check_volsize(volsize, + volblocksize)) != 0) + return (error); + } else if (type == DMU_OST_ZFS) { + int error; - /* - * We have to have normalization and - * case-folding flags correct when we do the - * file system creation, so go figure them out - * now. - */ - VERIFY(nvlist_alloc(&zct.zct_zplprops, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - error = zfs_fill_zplprops(zc->zc_name, nvprops, - zct.zct_zplprops, &is_insensitive); - if (error != 0) { - nvlist_free(nvprops); - nvlist_free(zct.zct_zplprops); - return (error); - } + /* + * We have to have normalization and + * case-folding flags correct when we do the + * file system creation, so go figure them out + * now. + */ + VERIFY(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops(fsname, nvprops, + zct.zct_zplprops, &is_insensitive); + if (error != 0) { + nvlist_free(zct.zct_zplprops); + return (error); } - error = dmu_objset_create(zc->zc_name, type, - is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); - nvlist_free(zct.zct_zplprops); } + error = dmu_objset_create(fsname, type, + is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); + nvlist_free(zct.zct_zplprops); + /* * It would be nice to do this atomically. 
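In the DMU_OST_ZVOL branch above, volblocksize and volsize are vetted by zvol_check_volblocksize() and zvol_check_volsize() before anything is created. The sketch below models the usual constraints (power-of-two block size within a bounded range, non-zero volume size that is a multiple of it); the specific limits are assumptions for illustration, not values taken from this diff:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define	MIN_VOLBLOCK	512ULL
#define	MAX_VOLBLOCK	(128ULL << 10)	/* assumed ceiling for this vintage */

static int
check_volblocksize(uint64_t bs)
{
	if (bs < MIN_VOLBLOCK || bs > MAX_VOLBLOCK || (bs & (bs - 1)) != 0)
		return (EDOM);
	return (0);
}

static int
check_volsize(uint64_t volsize, uint64_t bs)
{
	if (volsize == 0 || (volsize % bs) != 0)
		return (EINVAL);
	return (0);
}

int
main(void)
{
	printf("%d %d\n", check_volblocksize(8192),
	    check_volsize(1ULL << 20, 8192));
	return (0);
}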
*/ if (error == 0) { - error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, - nvprops, NULL); + error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, + nvprops, outnvl); if (error != 0) - (void) dmu_objset_destroy(zc->zc_name, B_FALSE); + (void) dsl_destroy_head(fsname); } - nvlist_free(nvprops); +#ifdef __FreeBSD__ + if (error == 0 && type == DMU_OST_ZVOL) + zvol_create_minors(fsname); +#endif return (error); } /* - * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot - * zc_cookie recursive flag - * zc_nvlist_src[_size] property list + * innvl: { + * "origin" -> name of origin snapshot + * (optional) "props" -> { prop -> value } + * } * - * outputs: - * zc_value short snapname (i.e. part after the '@') + * outnvl: propname -> error code (int32) */ static int -zfs_ioc_snapshot(zfs_cmd_t *zc) +zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { + int error = 0; nvlist_t *nvprops = NULL; - int error; - boolean_t recursive = zc->zc_cookie; - - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); + char *origin_name; - if (zc->zc_nvlist_src != 0 && - (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - zc->zc_iflags, &nvprops)) != 0) + if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) + return (SET_ERROR(EINVAL)); + (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); + + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); + + if (dataset_namecheck(origin_name, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + error = dmu_objset_clone(fsname, origin_name); + if (error != 0) return (error); - error = zfs_check_userprops(zc->zc_name, nvprops); - if (error) - goto out; - - if (!nvlist_empty(nvprops) && - zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { - error = ENOTSUP; - goto out; + /* + * It would be nice to do this atomically. 
+ */ + if (error == 0) { + error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, + nvprops, outnvl); + if (error != 0) + (void) dsl_destroy_head(fsname); } - - error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, - nvprops, recursive); - -out: - nvlist_free(nvprops); +#ifdef __FreeBSD__ + if (error == 0) + zvol_create_minors(fsname); +#endif return (error); } -int -zfs_unmount_snap(const char *name, void *arg) +/* + * innvl: { + * "snaps" -> { snapshot1, snapshot2 } + * (optional) "props" -> { prop -> value (string) } + * } + * + * outnvl: snapshot -> error code (int32) + */ +static int +zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { - vfs_t *vfsp = NULL; + nvlist_t *snaps; + nvlist_t *props = NULL; + int error, poollen; + nvpair_t *pair; - if (arg) { - char *snapname = arg; - char *fullname = kmem_asprintf("%s@%s", name, snapname); - vfsp = zfs_get_vfs(fullname); - strfree(fullname); - } else if (strchr(name, '@')) { - vfsp = zfs_get_vfs(name); - } + (void) nvlist_lookup_nvlist(innvl, "props", &props); + if ((error = zfs_check_userprops(poolname, props)) != 0) + return (error); - if (vfsp) { -#ifdef __NetBSD__ - int err; - if ((err = dounmount(vfsp, MNT_FORCE, curlwp)) != 0) - return (err); -#else + if (!nvlist_empty(props) && + zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) + return (SET_ERROR(ENOTSUP)); + + if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) + return (SET_ERROR(EINVAL)); + poollen = strlen(poolname); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + const char *name = nvpair_name(pair); + const char *cp = strchr(name, '@'); /* - * Always force the unmount for snapshots. + * The snap name must contain an @, and the part after it must + * contain only valid characters. */ - int flag = MS_FORCE; - int err; + if (cp == NULL || + zfs_component_namecheck(cp + 1, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); - if ((err = vn_vfswlock(vfsp->vfs_vnodecovered)) != 0) { - VFS_RELE(vfsp); - return (err); + /* + * The snap must be in the specified pool. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '@')) + return (SET_ERROR(EXDEV)); + + /* This must be the only snap of this fs. */ + for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); + pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { + if (strncmp(name, nvpair_name(pair2), cp - name + 1) + == 0) { + return (SET_ERROR(EXDEV)); + } } - VFS_RELE(vfsp); - if ((err = dounmount(vfsp, flag, kcred)) != 0) - return (err); -#endif } - return (0); + + error = dsl_dataset_snapshot(snaps, props, outnvl); + return (error); } /* - * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot - * zc_defer_destroy mark for deferred destroy - * - * outputs: none + * innvl: "message" -> string */ +/* ARGSUSED */ static int -zfs_ioc_destroy_snaps(zfs_cmd_t *zc) +zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { - int err; + char *message; + spa_t *spa; + int error; + char *poolname; - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); - err = dmu_objset_find(zc->zc_name, - zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); - if (err) - return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, - zc->zc_defer_destroy)); + /* + * The poolname in the ioctl is not set, we get it from the TSD, + * which was set at the end of the last successful ioctl that allows + * logging. 
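With zfs_ioc_snapshot() taking its arguments as an nvlist, several snapshots in one pool can now be created in a single atomic ioctl. A userland caller would normally go through libzfs_core (the library added elsewhere in this patch set); the sketch below assumes the three-argument lzc_snapshot() of this vintage and uses invented dataset names:

#include <libnvpair.h>
#include <libzfs_core.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *snaps, *errs = NULL;
	int err;

	if (libzfs_core_init() != 0)
		return (1);
	if (nvlist_alloc(&snaps, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	/* Each pair name is a full snapshot name; all must be in one pool. */
	(void) nvlist_add_boolean(snaps, "tank/home@today");
	(void) nvlist_add_boolean(snaps, "tank/src@today");

	err = lzc_snapshot(snaps, NULL, &errs);
	if (err != 0 && errs != NULL) {
		for (nvpair_t *p = nvlist_next_nvpair(errs, NULL); p != NULL;
		    p = nvlist_next_nvpair(errs, p))
			fprintf(stderr, "%s failed\n", nvpair_name(p));
	}
	if (errs != NULL)
		nvlist_free(errs);
	nvlist_free(snaps);
	libzfs_core_fini();
	return (err == 0 ? 0 : 1);
}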
The secpolicy func already checked that it is set. + * Only one log ioctl is allowed after each successful ioctl, so + * we clear the TSD here. + */ + poolname = tsd_get(zfs_allow_log_key); + (void) tsd_set(zfs_allow_log_key, NULL); + error = spa_open(poolname, &spa, FTAG); + strfree(poolname); + if (error != 0) + return (error); + + if (nvlist_lookup_string(innvl, "message", &message) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + error = spa_history_log(spa, message); + spa_close(spa, FTAG); + return (error); } -/* - * inputs: - * zc_name name of dataset to destroy - * zc_objset_type type of objset - * zc_defer_destroy mark for deferred destroy - * - * outputs: none - */ +#ifdef __FreeBSD__ static int -zfs_ioc_destroy(zfs_cmd_t *zc) +zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { - int err; - if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); - } + char name[MAXNAMELEN]; + spa_t *spa; + vdev_t *vd; + char *command; + uint64_t pool_guid; + uint64_t vdev_guid; + int error; - err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); - if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) - (void) zvol_remove_minor(zc->zc_name); - return (err); + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + return (EINVAL); + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_GUID, &vdev_guid) != 0) + return (EINVAL); + if (nvlist_lookup_string(innvl, + "command", &command) != 0) + return (EINVAL); + + mutex_enter(&spa_namespace_lock); + spa = spa_by_guid(pool_guid, vdev_guid); + if (spa != NULL) + strcpy(name, spa_name(spa)); + mutex_exit(&spa_namespace_lock); + if (spa == NULL) + return (ENOENT); + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENXIO); + spa_close(spa, FTAG); + return (ENODEV); + } + error = vdev_label_write_pad2(vd, command, strlen(command)); + (void) spa_vdev_state_exit(spa, NULL, 0); + txg_wait_synced(spa->spa_dsl_pool, 0); + spa_close(spa, FTAG); + return (error); } +#endif /* - * inputs: - * zc_name name of dataset to rollback (to most recent snapshot) + * The dp_config_rwlock must not be held when calling this, because the + * unmount may need to write out data. * - * outputs: none + * This function is best-effort. Callers must deal gracefully if it + * remains mounted (or is remounted after this call). + * + * Returns 0 if the argument is not a snapshot, or it is not currently a + * filesystem, or we were able to unmount it. Returns error code otherwise. 
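The thread-specific-data handshake described above (the pool name is deposited by the last successful logging-capable ioctl and consumed by exactly one follow-up log call) can be modelled in userland with a pthread key. Names below are illustrative; the kernel code uses tsd_get()/tsd_set() with zfs_allow_log_key:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static pthread_key_t allow_log_key;

static void
note_pool_for_logging(const char *pool)
{
	(void) pthread_setspecific(allow_log_key, strdup(pool));
}

static int
log_history(const char *msg)
{
	char *pool = pthread_getspecific(allow_log_key);

	if (pool == NULL)
		return (-1);	/* no qualifying ioctl preceded this call */
	(void) pthread_setspecific(allow_log_key, NULL);
	printf("history for %s: %s\n", pool, msg);
	free(pool);
	return (0);
}

int
main(void)
{
	(void) pthread_key_create(&allow_log_key, free);
	note_pool_for_logging("tank");
	return (log_history("example message") == 0 ? 0 : 1);
}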
*/ -static int -zfs_ioc_rollback(zfs_cmd_t *zc) +int +zfs_unmount_snap(const char *snapname) { - dsl_dataset_t *ds, *clone; - int error; + vfs_t *vfsp; zfsvfs_t *zfsvfs; - char *clone_name; - - error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); - if (error) - return (error); + int err; - /* must not be a snapshot */ - if (dsl_dataset_is_snapshot(ds)) { - dsl_dataset_rele(ds, FTAG); - return (EINVAL); - } + if (strchr(snapname, '@') == NULL) + return (0); - /* must have a most recent snapshot */ - if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { - dsl_dataset_rele(ds, FTAG); - return (EINVAL); - } + vfsp = zfs_get_vfs(snapname); + if (vfsp == NULL) + return (0); - /* - * Create clone of most recent snapshot. - */ - clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); - error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); - if (error) - goto out; + zfsvfs = vfsp->vfs_data; + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); - error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); - if (error) - goto out; + err = vn_vfswlock(vfsp->vfs_vnodecovered); +#ifdef illumos + VFS_RELE(vfsp); +#else + vfs_unbusy(vfsp); +#endif + if (err != 0) + return (SET_ERROR(err)); /* - * Do clone swap. + * Always force the unmount for snapshots. */ - if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { - error = zfs_suspend_fs(zfsvfs); - if (error == 0) { - int resume_err; - - if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { - error = dsl_dataset_clone_swap(clone, ds, - B_TRUE); - dsl_dataset_disown(ds, FTAG); - ds = NULL; - } else { - error = EBUSY; - } - resume_err = zfs_resume_fs(zfsvfs, zc->zc_name); - error = error ? error : resume_err; - } - VFS_RELE(zfsvfs->z_vfs); - } else { - if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { - error = dsl_dataset_clone_swap(clone, ds, B_TRUE); - dsl_dataset_disown(ds, FTAG); - ds = NULL; - } else { - error = EBUSY; - } - } - /* - * Destroy clone (which also closes it). - */ - (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); +#ifdef illumos + (void) dounmount(vfsp, MS_FORCE, kcred); +#else + vfs_ref(vfsp); + (void) dounmount(vfsp, MS_FORCE, curthread); +#endif + return (0); +} -out: - strfree(clone_name); - if (ds) - dsl_dataset_rele(ds, FTAG); - return (error); +/* ARGSUSED */ +static int +zfs_unmount_snap_cb(const char *snapname, void *arg) +{ + return (zfs_unmount_snap(snapname)); } /* - * inputs: - * zc_name old name of dataset - * zc_value new name of dataset - * zc_cookie recursive flag (only valid for snapshots) - * - * outputs: none + * When a clone is destroyed, its origin may also need to be destroyed, + * in which case it must be unmounted. This routine will do that unmount + * if necessary. 
*/ -static int +void +zfs_destroy_unmount_origin(const char *fsname) +{ + int error; + objset_t *os; + dsl_dataset_t *ds; + + error = dmu_objset_hold(fsname, FTAG, &os); + if (error != 0) + return; + ds = dmu_objset_ds(os); + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { + char originname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds->ds_prev, originname); + dmu_objset_rele(os, FTAG); + (void) zfs_unmount_snap(originname); + } else { + dmu_objset_rele(os, FTAG); + } +} + +/* + * innvl: { + * "snaps" -> { snapshot1, snapshot2 } + * (optional boolean) "defer" + * } + * + * outnvl: snapshot -> error code (int32) + * + */ +/* ARGSUSED */ +static int +zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error, poollen; + nvlist_t *snaps; + nvpair_t *pair; + boolean_t defer; + + if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) + return (SET_ERROR(EINVAL)); + defer = nvlist_exists(innvl, "defer"); + + poollen = strlen(poolname); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + const char *name = nvpair_name(pair); + + /* + * The snap must be in the specified pool to prevent the + * invalid removal of zvol minors below. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '@')) + return (SET_ERROR(EXDEV)); + + error = zfs_unmount_snap(name); + if (error != 0) + return (error); +#if defined(__FreeBSD__) + zvol_remove_minors(name); +#endif + } + + return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); +} + +/* + * Create bookmarks. Bookmark names are of the form #. + * All bookmarks must be in the same pool. + * + * innvl: { + * bookmark1 -> snapshot1, bookmark2 -> snapshot2 + * } + * + * outnvl: bookmark -> error code (int32) + * + */ +/* ARGSUSED */ +static int +zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + char *snap_name; + + /* + * Verify the snapshot argument. + */ + if (nvpair_value_string(pair, &snap_name) != 0) + return (SET_ERROR(EINVAL)); + + + /* Verify that the keys (bookmarks) are unique */ + for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); + pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { + if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) + return (SET_ERROR(EINVAL)); + } + } + + return (dsl_bookmark_create(innvl, outnvl)); +} + +/* + * innvl: { + * property 1, property 2, ... + * } + * + * outnvl: { + * bookmark name 1 -> { property 1, property 2, ... }, + * bookmark name 2 -> { property 1, property 2, ... } + * } + * + */ +static int +zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + return (dsl_get_bookmarks(fsname, innvl, outnvl)); +} + +/* + * innvl: { + * bookmark name 1, bookmark name 2 + * } + * + * outnvl: bookmark -> error code (int32) + * + */ +static int +zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + int error, poollen; + + poollen = strlen(poolname); + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + const char *name = nvpair_name(pair); + const char *cp = strchr(name, '#'); + + /* + * The bookmark name must contain an #, and the part after it + * must contain only valid characters. 
+ */ + if (cp == NULL || + zfs_component_namecheck(cp + 1, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + + /* + * The bookmark must be in the specified pool. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '#')) + return (SET_ERROR(EXDEV)); + } + + error = dsl_bookmark_destroy(innvl, outnvl); + return (error); +} + +/* + * inputs: + * zc_name name of dataset to destroy + * zc_objset_type type of objset + * zc_defer_destroy mark for deferred destroy + * + * outputs: none + */ +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + int err; + + if (zc->zc_objset_type == DMU_OST_ZFS) { + err = zfs_unmount_snap(zc->zc_name); + if (err != 0) + return (err); + } + + if (strchr(zc->zc_name, '@')) + err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); + else + err = dsl_destroy_head(zc->zc_name); + if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) +#ifdef __FreeBSD__ + zvol_remove_minors(zc->zc_name); +#else + (void) zvol_remove_minor(zc->zc_name); +#endif + return (err); +} + +/* + * fsname is name of dataset to rollback (to most recent snapshot) + * + * innvl is not used. + * + * outnvl: "target" -> name of most recent snapshot + * } + */ +/* ARGSUSED */ +static int +zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) +{ + zfsvfs_t *zfsvfs; + int error; + + if (getzfsvfs(fsname, &zfsvfs) == 0) { + dsl_dataset_t *ds; + + ds = dmu_objset_ds(zfsvfs->z_os); + error = zfs_suspend_fs(zfsvfs); + if (error == 0) { + int resume_err; + + error = dsl_dataset_rollback(fsname, zfsvfs, outnvl); + resume_err = zfs_resume_fs(zfsvfs, ds); + error = error ? error : resume_err; + } +#ifdef illumos + VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif + } else { + error = dsl_dataset_rollback(fsname, NULL, outnvl); + } + return (error); +} + +static int +recursive_unmount(const char *fsname, void *arg) +{ + const char *snapname = arg; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; + + (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); + return (zfs_unmount_snap(fullname)); +} + +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ +static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; + char *at; + boolean_t allow_mounted = B_TRUE; + +#ifdef __FreeBSD__ + allow_mounted = (zc->zc_cookie & 2) != 0; +#endif zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '%')) - return (EINVAL); + return (SET_ERROR(EINVAL)); - /* - * Unmount snapshot unless we're doing a recursive rename, - * in which case the dataset code figures out which snapshots - * to unmount. - */ - if (!recursive && strchr(zc->zc_name, '@') != NULL && - zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); - if (err) - return (err); + at = strchr(zc->zc_name, '@'); + if (at != NULL) { + /* snaps must be in same fs */ + int error; + + if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) + return (SET_ERROR(EXDEV)); + *at = '\0'; + if (zc->zc_objset_type == DMU_OST_ZFS && !allow_mounted) { + error = dmu_objset_find(zc->zc_name, + recursive_unmount, at + 1, + recursive ? 
DS_FIND_CHILDREN : 0); + if (error != 0) { + *at = '@'; + return (error); + } + } + error = dsl_dataset_rename_snapshot(zc->zc_name, + at + 1, strchr(zc->zc_value, '@') + 1, recursive); + *at = '@'; + + return (error); + } else { +#ifdef illumos + if (zc->zc_objset_type == DMU_OST_ZVOL) + (void) zvol_remove_minor(zc->zc_name); +#endif + return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } - if (zc->zc_objset_type == DMU_OST_ZVOL) - (void) zvol_remove_minor(zc->zc_name); - return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } static int @@ -3159,7 +3953,7 @@ zfs_check_settable(const char *dsname, n perm = ZFS_DELEG_PERM_GROUPQUOTA; } else { /* USERUSED and GROUPUSED are read-only */ - return (EINVAL); + return (SET_ERROR(EINVAL)); } if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) @@ -3167,11 +3961,11 @@ zfs_check_settable(const char *dsname, n return (0); } - return (EINVAL); + return (SET_ERROR(EINVAL)); } if (issnap) - return (EINVAL); + return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* @@ -3194,19 +3988,32 @@ zfs_check_settable(const char *dsname, n * the SPA supports it. We ignore any errors here since * we'll catch them later. */ - if (nvpair_type(pair) == DATA_TYPE_UINT64 && - nvpair_value_uint64(pair, &intval) == 0) { + if (nvpair_value_uint64(pair, &intval) == 0) { if (intval >= ZIO_COMPRESS_GZIP_1 && intval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } if (intval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); + + if (intval == ZIO_COMPRESS_LZ4) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LZ4_COMPRESS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } /* * If this is a bootable dataset then @@ -3217,24 +4024,45 @@ zfs_check_settable(const char *dsname, n */ if (zfs_is_bootfs(dsname) && !BOOTFS_COMPRESS_VALID(intval)) { - return (ERANGE); + return (SET_ERROR(ERANGE)); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); break; - case ZFS_PROP_DEDUP: - if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) - return (ENOTSUP); + case ZFS_PROP_RECORDSIZE: + /* Record sizes above 128k need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval > SPA_OLD_MAXBLOCKSIZE) { + spa_t *spa; + + /* + * We don't allow setting the property above 1MB, + * unless the tunable has been changed. 
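The snapshot branch of zfs_ioc_rename() above temporarily NUL-terminates zc_name at the '@', works on the filesystem part, and restores the byte before returning so the caller's buffer is left intact on both the success and error paths. A standalone sketch of that in-place split (the printf stands in for the real rename call):

#include <stdio.h>
#include <string.h>

static int
rename_snapshot(char *name /* e.g. "tank/fs@old" */, const char *newsnap)
{
	char *at = strchr(name, '@');

	if (at == NULL)
		return (-1);
	*at = '\0';			/* name now reads "tank/fs" */
	printf("rename %s@%s -> %s@%s\n", name, at + 1, name, newsnap);
	*at = '@';			/* restore before returning */
	return (0);
}

int
main(void)
{
	char buf[] = "tank/fs@old";

	return (rename_snapshot(buf, "new"));
}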
+ */ + if (intval > zfs_max_recordsize || + intval > SPA_MAXBLOCKSIZE) + return (SET_ERROR(ERANGE)); + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_BLOCKS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: @@ -3243,15 +4071,104 @@ zfs_check_settable(const char *dsname, n if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); + } + break; + + case ZFS_PROP_CHECKSUM: + case ZFS_PROP_DEDUP: + { + spa_feature_t feature; + spa_t *spa; + + /* dedup feature version checks */ + if (prop == ZFS_PROP_DEDUP && + zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) + return (SET_ERROR(ENOTSUP)); + + if (nvpair_value_uint64(pair, &intval) != 0) + return (SET_ERROR(EINVAL)); + + /* check prop value is enabled in features */ + feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK); + if (feature == SPA_FEATURE_NONE) + break; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + /* + * Salted checksums are not supported on root pools. + */ + if (spa_bootfs(spa) != 0 && + intval < ZIO_CHECKSUM_FUNCTIONS && + (zio_checksum_table[intval].ci_flags & + ZCHECKSUM_FLAG_SALTED)) { + spa_close(spa, FTAG); + return (SET_ERROR(ERANGE)); + } + if (!spa_feature_is_enabled(spa, feature)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); } + spa_close(spa, FTAG); break; } + } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* + * Checks for a race condition to make sure we don't increment a feature flag + * multiple times. + */ +static int +zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_t *featurep = arg; + + if (!spa_feature_is_active(spa, *featurep)) + return (0); + else + return (SET_ERROR(EBUSY)); +} + +/* + * The callback invoked on feature activation in the sync task caused by + * zfs_prop_activate_feature. + */ +static void +zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_feature_t *featurep = arg; + + spa_feature_incr(spa, *featurep, tx); +} + +/* + * Activates a feature on a pool in response to a property setting. This + * creates a new sync task which modifies the pool to reflect the feature + * as being active. + */ +static int +zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) +{ + int err; + + /* EBUSY here indicates that the feature is already active */ + err = dsl_sync_task(spa_name(spa), + zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, + &feature, 2, ZFS_SPACE_CHECK_RESERVED); + + if (err != 0 && err != EBUSY) + return (err); + else + return (0); +} + +/* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. 
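zfs_prop_activate_feature() above uses the dsl_sync_task() split: a check callback that may run more than once and must stay side-effect free, and a sync callback that runs exactly once in syncing context. A small userland model of that two-phase idiom, with invented names (the retry behaviour of the real machinery is not reproduced here):

#include <errno.h>
#include <stdio.h>

typedef int (*check_fn)(void *);
typedef void (*sync_fn)(void *);

static int
run_sync_task(check_fn check, sync_fn sync, void *arg)
{
	int err = check(arg);	/* re-evaluated under the real txg machinery */

	if (err != 0)
		return (err);
	sync(arg);		/* committed state change, runs once */
	return (0);
}

static int feature_active;

static int
activate_check(void *arg)
{
	(void) arg;
	return (feature_active ? EBUSY : 0);
}

static void
activate_sync(void *arg)
{
	(void) arg;
	feature_active = 1;
}

int
main(void)
{
	int err = run_sync_task(activate_check, activate_sync, NULL);

	/* As in the patch, EBUSY just means the feature is already active. */
	printf("%s\n", (err == 0 || err == EBUSY) ? "active" : "failed");
	return (0);
}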
@@ -3286,7 +4203,7 @@ zfs_check_clearable(char *dataset, nvlis (void) strcpy(zc->zc_value, nvpair_name(pair)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || - (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { + (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); @@ -3379,6 +4296,56 @@ next: } } +/* + * Extract properties that cannot be set PRIOR to the receipt of a dataset. + * For example, refquota cannot be set until after the receipt of a dataset, + * because in replication streams, an older/earlier snapshot may exceed the + * refquota. We want to receive the older/earlier snapshot, but setting + * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent + * the older/earlier snapshot from being received (with EDQUOT). + * + * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. + * + * libzfs will need to be judicious handling errors encountered by props + * extracted by this function. + */ +static nvlist_t * +extract_delay_props(nvlist_t *props) +{ + nvlist_t *delayprops; + nvpair_t *nvp, *tmp; + static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; + int i; + + VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(props, nvp)) { + /* + * strcmp() is safe because zfs_prop_to_name() always returns + * a bounded string. + */ + for (i = 0; delayable[i] != 0; i++) { + if (strcmp(zfs_prop_to_name(delayable[i]), + nvpair_name(nvp)) == 0) { + break; + } + } + if (delayable[i] != 0) { + tmp = nvlist_prev_nvpair(props, nvp); + VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); + VERIFY(nvlist_remove_nvpair(props, nvp) == 0); + nvp = tmp; + } + } + + if (nvlist_empty(delayprops)) { + nvlist_free(delayprops); + delayprops = NULL; + } + return (delayprops); +} + #ifdef DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif @@ -3392,17 +4359,20 @@ static boolean_t zfs_ioc_recv_inject_err * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag + * zc_cleanup_fd cleanup-on-exit file descriptor + * zc_action_handle handle for this guid/ds mapping (or zero on first call) + * zc_resumable if data is incomplete assume sender will resume * * outputs: * zc_cookie number of bytes read * zc_nvlist_dst{_size} error for each unapplied received property * zc_obj zprop_errflags_t + * zc_action_handle handle for this guid/ds mapping */ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; - objset_t *os; dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int fd; @@ -3412,15 +4382,19 @@ zfs_ioc_recv(zfs_cmd_t *zc) offset_t off; nvlist_t *props = NULL; /* sent properties */ nvlist_t *origprops = NULL; /* existing properties */ - objset_t *origin = NULL; + nvlist_t *delayprops = NULL; /* sent properties applied post-receive */ + char *origin = NULL; char *tosnap; - char tofs[ZFS_MAXNAMELEN]; + char tofs[ZFS_MAX_DATASET_NAME_LEN]; +#ifdef __FreeBSD__ + cap_rights_t rights; +#endif boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) - return (EINVAL); + return (SET_ERROR(EINVAL)); (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); @@ -3432,28 +4406,45 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (error); fd = zc->zc_cookie; - error = fd_getvnode(fd, &fp); - if 
(error != 0) { +#ifdef __FreeBSD__ + fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp); +#else + fp = getf(fd); +#endif + if (fp == NULL) { nvlist_free(props); - return (error); + return (SET_ERROR(EBADF)); } - VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + errors = fnvlist_alloc(); - if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { - if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && - !dsl_prop_get_hasrecvd(os)) { - first_recvd_props = B_TRUE; - } + if (zc->zc_string[0]) + origin = zc->zc_string; - /* - * If new received properties are supplied, they are to - * completely replace the existing received properties, so stash - * away the existing ones. - */ - if (dsl_prop_get_received(os, &origprops) == 0) { - nvlist_t *errlist = NULL; - /* + error = dmu_recv_begin(tofs, tosnap, + &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc); + if (error != 0) + goto out; + + /* + * Set properties before we receive the stream so that they are applied + * to the new data. Note that we must call dmu_recv_stream() if + * dmu_recv_begin() succeeds. + */ + if (props != NULL && !drc.drc_newfs) { + if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= + SPA_VERSION_RECVD_PROPS && + !dsl_prop_get_hasrecvd(tofs)) + first_recvd_props = B_TRUE; + + /* + * If new received properties are supplied, they are to + * completely replace the existing received properties, so stash + * away the existing ones. + */ + if (dsl_prop_get_received(tofs, &origprops) == 0) { + nvlist_t *errlist = NULL; + /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * @@ -3463,97 +4454,96 @@ zfs_ioc_recv(zfs_cmd_t *zc) */ if (!first_recvd_props) props_reduce(props, origprops); - if (zfs_check_clearable(tofs, origprops, - &errlist) != 0) + if (zfs_check_clearable(tofs, origprops, &errlist) != 0) (void) nvlist_merge(errors, errlist, 0); nvlist_free(errlist); - } - - dmu_objset_rele(os, FTAG); - } - - if (zc->zc_string[0]) { - error = dmu_objset_hold(zc->zc_string, FTAG, &origin); - if (error) - goto out; - } - - error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, - &zc->zc_begin_record, force, origin, &drc); - if (origin) - dmu_objset_rele(origin, FTAG); - if (error) - goto out; - - /* - * Set properties before we receive the stream so that they are applied - * to the new data. Note that we must call dmu_recv_stream() if - * dmu_recv_begin() succeeds. - */ - if (props) { - nvlist_t *errlist; - if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { - if (drc.drc_newfs) { - if (spa_version(os->os_spa) >= - SPA_VERSION_RECVD_PROPS) - first_recvd_props = B_TRUE; - } else if (origprops != NULL) { - if (clear_received_props(os, tofs, origprops, - first_recvd_props ? NULL : props) != 0) - zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } else { + if (clear_received_props(tofs, origprops, + first_recvd_props ? NULL : props) != 0) zc->zc_obj |= ZPROP_ERR_NOCLEAR; - } - dsl_prop_set_hasrecvd(os); - } else if (!drc.drc_newfs) { + } else { zc->zc_obj |= ZPROP_ERR_NOCLEAR; } - - (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, - props, &errlist); - (void) nvlist_merge(errors, errlist, 0); - nvlist_free(errlist); } - if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) { - /* - * Caller made zc->zc_nvlist_dst less than the minimum expected - * size or supplied an invalid address. 
- */ - props_error = EINVAL; + if (props != NULL) { + props_error = dsl_prop_set_hasrecvd(tofs); + + if (props_error == 0) { + delayprops = extract_delay_props(props); + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + props, errors); + } } off = fp->f_offset; - error = dmu_recv_stream(&drc, fp->f_data, &off); + error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, + &zc->zc_action_handle); if (error == 0) { zfsvfs_t *zfsvfs = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ + dsl_dataset_t *ds; int end_err; + ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ - end_err = dmu_recv_end(&drc); - if (error == 0) { - int resume_err = - zfs_resume_fs(zfsvfs, tofs); - error = error ? error : resume_err; - } + end_err = dmu_recv_end(&drc, zfsvfs); + if (error == 0) + error = zfs_resume_fs(zfsvfs, ds); error = error ? error : end_err; +#ifdef illumos VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif } else { - error = dmu_recv_end(&drc); + error = dmu_recv_end(&drc, NULL); + } + + /* Set delayed properties now, after we're done receiving. */ + if (delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + delayprops, errors); } } + if (delayprops != NULL) { + /* + * Merge delayed props back in with initial props, in case + * we're DEBUG and zfs_ioc_recv_inject_err is set (which means + * we have to make sure clear_received_props() includes + * the delayed properties). + * + * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, + * using ASSERT() will be just like a VERIFY. + */ + ASSERT(nvlist_merge(props, delayprops, 0) == 0); + nvlist_free(delayprops); + } + + /* + * Now that all props, initial and delayed, are set, report the prop + * errors to the caller. + */ + if (zc->zc_nvlist_dst_size != 0 && + (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || + put_nvlist(zc, errors) != 0)) { + /* + * Caller made zc->zc_nvlist_dst less than the minimum expected + * size or supplied an invalid address. + */ + props_error = SET_ERROR(EINVAL); + } + zc->zc_cookie = off - fp->f_offset; - if (VOP_SEEK(fp->f_data, fp->f_offset, &off, NULL) == 0) + if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; #ifdef DEBUG @@ -3562,25 +4552,25 @@ zfs_ioc_recv(zfs_cmd_t *zc) error = 1; } #endif + +#ifdef __FreeBSD__ + if (error == 0) + zvol_create_minors(tofs); +#endif + /* * On error, restore the original props. */ - if (error && props) { - if (dmu_objset_hold(tofs, FTAG, &os) == 0) { - if (clear_received_props(os, tofs, props, NULL) != 0) { - /* - * We failed to clear the received properties. - * Since we may have left a $recvd value on the - * system, we can't clear the $hasrecvd flag. - */ - zc->zc_obj |= ZPROP_ERR_NORESTORE; - } else if (first_recvd_props) { - dsl_prop_unset_hasrecvd(os); - } - dmu_objset_rele(os, FTAG); - } else if (!drc.drc_newfs) { - /* We failed to clear the received properties. */ + if (error != 0 && props != NULL && !drc.drc_newfs) { + if (clear_received_props(tofs, props, NULL) != 0) { + /* + * We failed to clear the received properties. + * Since we may have left a $recvd value on the + * system, we can't clear the $hasrecvd flag. 
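extract_delay_props() above pulls properties such as refquota out of the received property list so they are applied only once the stream has been received; a replicated snapshot may legitimately exceed the refquota while it is still being written. A simplified model of that partition step, using plain arrays instead of nvlists:

#include <stdio.h>
#include <string.h>

static const char *delayable[] = { "refquota", NULL };

static int
is_delayable(const char *prop)
{
	for (int i = 0; delayable[i] != NULL; i++)
		if (strcmp(prop, delayable[i]) == 0)
			return (1);
	return (0);
}

int
main(void)
{
	const char *received[] = { "compression", "refquota", "atime" };

	for (size_t i = 0; i < sizeof (received) / sizeof (received[0]); i++)
		printf("%s: apply %s receive\n", received[i],
		    is_delayable(received[i]) ? "after" : "before");
	return (0);
}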
+ */ zc->zc_obj |= ZPROP_ERR_NORESTORE; + } else if (first_recvd_props) { + dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { @@ -3609,7 +4599,7 @@ out: nvlist_free(props); nvlist_free(origprops); nvlist_free(errors); - fd_putfile(fd); + releasef(fd); if (error == 0) error = props_error; @@ -3620,60 +4610,157 @@ out: /* * inputs: * zc_name name of snapshot to send - * zc_value short name of incremental fromsnap (may be empty) * zc_cookie file descriptor to send stream to - * zc_obj fromorigin flag (mutually exclusive with zc_value) + * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) + * zc_sendobj objsetid of snapshot to send + * zc_fromobj objsetid of incremental fromsnap (may be zero) + * zc_guid if set, estimate size of stream only. zc_cookie is ignored. + * output size in zc_objset_type. + * zc_flags lzc_send_flags * - * outputs: none + * outputs: + * zc_objset_type estimated size, if zc_guid is set */ static int zfs_ioc_send(zfs_cmd_t *zc) { - objset_t *fromsnap = NULL; - objset_t *tosnap; - file_t *fp; int error; offset_t off; + boolean_t estimate = (zc->zc_guid != 0); + boolean_t embedok = (zc->zc_flags & 0x1); + boolean_t large_block_ok = (zc->zc_flags & 0x2); + + if (zc->zc_obj != 0) { + dsl_pool_t *dp; + dsl_dataset_t *tosnap; - error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); - if (error) - return (error); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + if (dsl_dir_is_clone(tosnap->ds_dir)) + zc->zc_fromobj = + dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + } + + if (estimate) { + dsl_pool_t *dp; + dsl_dataset_t *tosnap; + dsl_dataset_t *fromsnap = NULL; + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); - if (zc->zc_value[0] != '\0') { - char *buf; - char *cp; - - buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strncpy(buf, zc->zc_name, MAXPATHLEN); - cp = strchr(buf, '@'); - if (cp) - *(cp+1) = 0; - (void) strncat(buf, zc->zc_value, MAXPATHLEN); - error = dmu_objset_hold(buf, FTAG, &fromsnap); - kmem_free(buf, MAXPATHLEN); - if (error) { - dmu_objset_rele(tosnap, FTAG); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); return (error); } + + if (zc->zc_fromobj != 0) { + error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, + FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + } + + error = dmu_send_estimate(tosnap, fromsnap, + &zc->zc_objset_type); + + if (fromsnap != NULL) + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + } else { + file_t *fp; +#ifdef __FreeBSD__ + cap_rights_t rights; + + fget_write(curthread, zc->zc_cookie, + cap_rights_init(&rights, CAP_WRITE), &fp); +#else + fp = getf(zc->zc_cookie); +#endif + if (fp == NULL) + return (SET_ERROR(EBADF)); + + off = fp->f_offset; + error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, + zc->zc_fromobj, embedok, large_block_ok, +#ifdef illumos + zc->zc_cookie, fp->f_vnode, &off); +#else + zc->zc_cookie, fp, &off); +#endif + + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + releasef(zc->zc_cookie); } + return (error); +} + +/* + * inputs: + * zc_name name of snapshot on which to report progress + * zc_cookie file descriptor of 
send stream + * + * outputs: + * zc_cookie number of bytes written in send stream thus far + */ +static int +zfs_ioc_send_progress(zfs_cmd_t *zc) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + dmu_sendarg_t *dsp = NULL; + int error; + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); - error = fd_getvnode(zc->zc_cookie, &fp); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { - dmu_objset_rele(tosnap, FTAG); - if (fromsnap) - dmu_objset_rele(fromsnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } - off = fp->f_offset; - error = dmu_sendbackup(tosnap, fromsnap, zc->zc_obj, fp->f_data, &off); + mutex_enter(&ds->ds_sendstream_lock); - if (VOP_SEEK(fp->f_data, fp->f_offset, &off, NULL) == 0) - fp->f_offset = off; - fd_putfile(zc->zc_cookie); - if (fromsnap) - dmu_objset_rele(fromsnap, FTAG); - dmu_objset_rele(tosnap, FTAG); + /* + * Iterate over all the send streams currently active on this dataset. + * If there's one which matches the specified file descriptor _and_ the + * stream was started by the current process, return the progress of + * that stream. + */ + for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; + dsp = list_next(&ds->ds_sendstreams, dsp)) { + if (dsp->dsa_outfd == zc->zc_cookie && + dsp->dsa_proc == curproc) + break; + } + + if (dsp != NULL) + zc->zc_cookie = *(dsp->dsa_off); + else + error = SET_ERROR(ENOENT); + + mutex_exit(&ds->ds_sendstream_lock); + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } @@ -3747,7 +4834,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); - return (EIO); + return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ @@ -3763,21 +4850,24 @@ zfs_ioc_clear(zfs_cmd_t *zc) nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { - (void) put_nvlist(zc, config); + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; nvlist_free(config); } nvlist_free(policy); } } - if (error) + if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); @@ -3789,7 +4879,7 @@ zfs_ioc_clear(zfs_cmd_t *zc) if (vd == NULL) { (void) spa_vdev_state_exit(spa, NULL, ENODEV); spa_close(spa, FTAG); - return (ENODEV); + return (SET_ERROR(ENODEV)); } } @@ -3801,13 +4891,39 @@ zfs_ioc_clear(zfs_cmd_t *zc) * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) - error = EIO; + error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } +static int +zfs_ioc_pool_reopen(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + spa_vdev_state_enter(spa, SCL_NONE); + + /* + * If a resilver is already in progress then set the + * spa_scrub_reopen flag to B_TRUE so that we don't restart + * the scan as a side effect of the reopen. Otherwise, let + * vdev_open() decided if a resilver is required. 
+ */ + spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool); + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (0); +} /* * inputs: * zc_name name of filesystem @@ -3829,7 +4945,7 @@ zfs_ioc_promote(zfs_cmd_t *zc) if (cp) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, - zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); + zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } @@ -3852,10 +4968,10 @@ zfs_ioc_userspace_one(zfs_cmd_t *zc) int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) - return (EINVAL); + return (SET_ERROR(EINVAL)); - error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); - if (error) + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); + if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, @@ -3880,22 +4996,24 @@ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; - int error; + int bufsize = zc->zc_nvlist_dst_size; + + if (bufsize <= 0) + return (SET_ERROR(ENOMEM)); - error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); - if (error) + int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); + if (error != 0) return (error); - int bufsize = zc->zc_nvlist_dst_size; void *buf = kmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { - error = xcopyout(buf, + error = ddi_copyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, - zc->zc_nvlist_dst_size); + zc->zc_nvlist_dst_size, zc->zc_iflags); } kmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); @@ -3924,17 +5042,27 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ + dsl_dataset_t *ds; + + ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); - if (error == 0) - error = zfs_resume_fs(zfsvfs, zc->zc_name); + if (error == 0) { + dmu_objset_refresh_ownership(zfsvfs->z_os, + zfsvfs); + error = zfs_resume_fs(zfsvfs, ds); + } } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); +#ifdef illumos VFS_RELE(zfsvfs->z_vfs); +#else + vfs_unbusy(zfsvfs->z_vfs); +#endif } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); - if (error) + if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); @@ -3944,6 +5072,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) return (error); } +#ifdef illumos /* * We don't want to have a hard dependency * against some special symbols in sharefs @@ -3951,17 +5080,6 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) * the first file system is shared. * Neither sharefs, nfs or smbsrv are unloadable modules. 
*/ -#ifdef __NetBSD__ - -static int -zfs_ioc_share(zfs_cmd_t *zc) -{ - - return EOPNOTSUPP; -} - -#else /* __NetBSD__ */ - int (*znfsexport_fs)(void *arg); int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); int (*zsmbexport_fs)(void *arg, boolean_t add_share); @@ -3972,8 +5090,10 @@ int zfs_smbshare_inited; ddi_modhandle_t nfs_mod; ddi_modhandle_t sharefs_mod; ddi_modhandle_t smbsrv_mod; +#endif /* illumos */ kmutex_t zfs_share_lock; +#ifdef illumos static int zfs_init_sharefs() { @@ -3984,19 +5104,21 @@ zfs_init_sharefs() if (sharefs_mod == NULL && ((sharefs_mod = ddi_modopen("fs/sharefs", KRTLD_MODE_FIRST, &error)) == NULL)) { - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } if (zshare_fs == NULL && ((zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } return (0); } +#endif /* illumos */ static int zfs_ioc_share(zfs_cmd_t *zc) { +#ifdef illumos int error; int opcode; @@ -4008,19 +5130,19 @@ zfs_ioc_share(zfs_cmd_t *zc) if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } if (znfsexport_fs == NULL && ((znfsexport_fs = (int (*)(void *)) ddi_modsym(nfs_mod, "nfs_export", &error)) == NULL)) { mutex_exit(&zfs_share_lock); - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); - if (error) { + if (error != 0) { mutex_exit(&zfs_share_lock); - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } zfs_nfsshare_inited = 1; mutex_exit(&zfs_share_lock); @@ -4034,25 +5156,25 @@ zfs_ioc_share(zfs_cmd_t *zc) ddi_modopen("drv/smbsrv", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } if (zsmbexport_fs == NULL && ((zsmbexport_fs = (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, "smb_server_share", &error)) == NULL)) { mutex_exit(&zfs_share_lock); - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); - if (error) { + if (error != 0) { mutex_exit(&zfs_share_lock); - return (ENOSYS); + return (SET_ERROR(ENOSYS)); } zfs_smbshare_inited = 1; mutex_exit(&zfs_share_lock); } break; default: - return (EINVAL); + return (SET_ERROR(EINVAL)); } switch (zc->zc_share.z_sharetype) { @@ -4087,6 +5209,9 @@ zfs_ioc_share(zfs_cmd_t *zc) return (error); +#else /* !illumos */ + return (ENOSYS); +#endif /* illumos */ } ace_t full_access[] = { @@ -4094,6 +5219,109 @@ ace_t full_access[] = { }; /* + * inputs: + * zc_name name of containing filesystem + * zc_obj object # beyond which we want next in-use object # + * + * outputs: + * zc_obj next in-use object # + */ +static int +zfs_ioc_next_obj(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error != 0) + return (error); + + error = dmu_object_next(os, &zc->zc_obj, B_FALSE, + dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg); + + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value prefix name for snapshot + * zc_cleanup_fd cleanup-on-exit file descriptor for calling process + * + * outputs: + * zc_value short name of new snapshot + */ +static int +zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) +{ + char *snap_name; + char *hold_name; + int error; + minor_t minor; + + error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (error != 0) + return (error); + + snap_name = 
kmem_asprintf("%s-%016llx", zc->zc_value, + (u_longlong_t)ddi_get_lbolt64()); + hold_name = kmem_asprintf("%%%s", zc->zc_value); + + error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, + hold_name); + if (error == 0) + (void) strcpy(zc->zc_value, snap_name); + strfree(snap_name); + strfree(hold_name); + zfs_onexit_fd_rele(zc->zc_cleanup_fd); + return (error); +} + +/* + * inputs: + * zc_name name of "to" snapshot + * zc_value name of "from" snapshot + * zc_cookie file descriptor to write diff data on + * + * outputs: + * dmu_diff_record_t's to the file descriptor + */ +static int +zfs_ioc_diff(zfs_cmd_t *zc) +{ + file_t *fp; + offset_t off; + int error; + +#ifdef __FreeBSD__ + cap_rights_t rights; + + fget_write(curthread, zc->zc_cookie, + cap_rights_init(&rights, CAP_WRITE), &fp); +#else + fp = getf(zc->zc_cookie); +#endif + if (fp == NULL) + return (SET_ERROR(EBADF)); + + off = fp->f_offset; + +#ifdef __FreeBSD__ + error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); +#else + error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); +#endif + + if (off >= 0 && off <= MAXOFFSET_T) + fp->f_offset = off; + releasef(zc->zc_cookie); + + return (error); +} + +#ifdef illumos +/* * Remove all ACL files in shares dir */ static int @@ -4114,10 +5342,12 @@ zfs_smb_acl_purge(znode_t *dzp) zap_cursor_fini(&zc); return (error); } +#endif /* illumos */ static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { +#ifdef illumos vnode_t *vp; znode_t *dzp; vnode_t *resourcevp = NULL; @@ -4130,18 +5360,18 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) int error = 0; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NULL, &vp)) != 0) + NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ -#ifndef __NetBSD__ - if (vp->v_vfsp->vfs_fstype != zfsfstype || + + if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); - return (EINVAL); + return (SET_ERROR(EINVAL)); } -#endif + dzp = VTOZ(vp); zfsvfs = dzp->z_zfsvfs; ZFS_ENTER(zfsvfs); @@ -4158,13 +5388,13 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { + if (error != 0) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zfsvfs, tx); dmu_tx_commit(tx); } - if (error) { + if (error != 0) { mutex_exit(&zfsvfs->z_lock); VN_RELE(vp); ZFS_EXIT(zfsvfs); @@ -4208,6 +5438,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); } @@ -4230,7 +5461,7 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) break; default: - error = EINVAL; + error = SET_ERROR(EINVAL); break; } @@ -4240,457 +5471,1187 @@ zfs_ioc_smb_acl(zfs_cmd_t *zc) ZFS_EXIT(zfsvfs); return (error); +#else /* !illumos */ + return (EOPNOTSUPP); +#endif /* illumos */ } -#endif /* __NetBSD__ */ /* - * inputs: - * zc_name name of filesystem - * zc_value short name of snap - * zc_string user-supplied tag for this reference - * zc_cookie recursive flag - * zc_temphold set if hold is temporary + * innvl: { + * "holds" -> { snapname -> holdname (string), ... } + * (optional) "cleanup_fd" -> fd (int32) + * } * - * outputs: none + * outnvl: { + * snapname -> error value (int32) + * ... 
+ * } */ +/* ARGSUSED */ static int -zfs_ioc_hold(zfs_cmd_t *zc) +zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { - boolean_t recursive = zc->zc_cookie; + nvpair_t *pair; + nvlist_t *holds; + int cleanup_fd = -1; + int error; + minor_t minor = 0; - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); + error = nvlist_lookup_nvlist(args, "holds", &holds); + if (error != 0) + return (SET_ERROR(EINVAL)); - return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, - zc->zc_string, recursive, zc->zc_temphold)); -} + /* make sure the user didn't pass us any invalid (empty) tags */ + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + char *htag; -/* - * inputs: - * zc_name name of dataset from which we're releasing a user reference - * zc_value short name of snap - * zc_string user-supplied tag for this reference - * zc_cookie recursive flag - * - * outputs: none + error = nvpair_value_string(pair, &htag); + if (error != 0) + return (SET_ERROR(error)); + + if (strlen(htag) == 0) + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { + error = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (error != 0) + return (error); + } + + error = dsl_dataset_user_hold(holds, minor, errlist); + if (minor != 0) + zfs_onexit_fd_rele(cleanup_fd); + return (error); +} + +/* + * innvl is not used. + * + * outnvl: { + * holdname -> time added (uint64 seconds since epoch) + * ... + * } */ +/* ARGSUSED */ static int -zfs_ioc_release(zfs_cmd_t *zc) +zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { - boolean_t recursive = zc->zc_cookie; - - if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) - return (EINVAL); + return (dsl_dataset_get_holds(snapname, outnvl)); +} - return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, - zc->zc_string, recursive)); +/* + * innvl: { + * snapname -> { holdname, ... } + * ... + * } + * + * outnvl: { + * snapname -> error value (int32) + * ... 
+ * } + */ +/* ARGSUSED */ +static int +zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) +{ + return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: - * zc_name name of filesystem + * zc_name name of new filesystem or snapshot + * zc_value full name of old snapshot * * outputs: - * zc_nvlist_src{_size} nvlist of snapshot holds + * zc_cookie space in bytes + * zc_objset_type compressed space in bytes + * zc_perm_action uncompressed space in bytes */ static int -zfs_ioc_get_holds(zfs_cmd_t *zc) +zfs_ioc_space_written(zfs_cmd_t *zc) { - nvlist_t *nvp; int error; + dsl_pool_t *dp; + dsl_dataset_t *new, *old; - if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { - error = put_nvlist(zc, nvp); - nvlist_free(nvp); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); } + error = dsl_dataset_space_written(old, new, &zc->zc_cookie, + &zc->zc_objset_type, &zc->zc_perm_action); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } /* - * pool create, destroy, and export don't log the history as part of - * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export - * do the logging of those commands. - */ -static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_FALSE }, - { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_FALSE }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, - 
B_TRUE}, - { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE }, - { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME, - B_TRUE, B_TRUE }, - { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, - B_TRUE }, - /*{ zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, - B_FALSE },*/ - { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, - DATASET_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, - DATASET_NAME, B_FALSE, B_FALSE }, - { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, - DATASET_NAME, B_FALSE, B_TRUE }, - { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE }, - { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, - B_TRUE }, - { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_TRUE }, - { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, - B_FALSE }, - { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, - B_TRUE } -}; - -int -pool_status_check(const char *name, zfs_ioc_namecheck_t type) + * innvl: { + * "firstsnap" -> snapshot name + * } + * + * outnvl: { + * "used" -> space in bytes + * "compressed" -> compressed space in bytes + * "uncompressed" -> uncompressed space in bytes + * } + */ +static int +zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { - spa_t *spa; int error; + dsl_pool_t *dp; + dsl_dataset_t *new, *old; + char *firstsnap; + uint64_t used, comp, uncomp; - ASSERT(type == POOL_NAME || type == DATASET_NAME); + if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) + return (SET_ERROR(EINVAL)); - error = spa_open(name, &spa, FTAG); - if (error == 0) { - if (spa_suspended(spa)) - error = EAGAIN; - spa_close(spa, FTAG); + error = dsl_pool_hold(lastsnap, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); + if (error == 0 && !new->ds_is_snapshot) { + dsl_dataset_rele(new, FTAG); + error = SET_ERROR(EINVAL); + } + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, 
firstsnap, FTAG, &old); + if (error == 0 && !old->ds_is_snapshot) { + dsl_dataset_rele(old, FTAG); + error = SET_ERROR(EINVAL); + } + if (error != 0) { + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); } + + error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + fnvlist_add_uint64(outnvl, "used", used); + fnvlist_add_uint64(outnvl, "compressed", comp); + fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } +#ifdef __FreeBSD__ + static int -zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) +zfs_ioc_jail(zfs_cmd_t *zc) { - zfs_cmd_t *zc; - uint_t vec; - int error, rc; - dprintf("zfsdev_ioctl called \n"); + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); +} - if (getminor(dev) != 0) - return (zvol_ioctl(dev, cmd, arg, flag, cr, rvalp)); - dprintf("zfsdev_ioctl -> zvol_ioctl\n"); - vec = cmd - ZFS_IOC; - ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); +static int +zfs_ioc_unjail(zfs_cmd_t *zc) +{ - if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) - return (EINVAL); + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_jailid)); +} - zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); +#endif - error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); - if ((error == 0) && !(flag & FKIOCTL)) - error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); +/* + * innvl: { + * "fd" -> file descriptor to write stream to (int32) + * (optional) "fromsnap" -> full snap name to send an incremental from + * (optional) "largeblockok" -> (value ignored) + * indicates that blocks > 128KB are permitted + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "resume_object" and "resume_offset" -> (uint64) + * if present, resume send stream from specified object and offset. + * } + * + * outnvl is unused + */ +/* ARGSUSED */ +static int +zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error; + offset_t off; + char *fromname = NULL; + int fd; + boolean_t largeblockok; + boolean_t embedok; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; - /* - * Ensure that all pool/dataset names are valid before we pass down to - * the lower layers. 
- */ - if (error == 0) { - dprintf("zfsdev_ioctl, zc->zc_name %s\n", zc->zc_name); - zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; - zc->zc_iflags = flag & FKIOCTL; - switch (zfs_ioc_vec[vec].zvec_namecheck) { - case POOL_NAME: - if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) - error = EINVAL; - if (zfs_ioc_vec[vec].zvec_pool_check) - error = pool_status_check(zc->zc_name, - zfs_ioc_vec[vec].zvec_namecheck); - break; + error = nvlist_lookup_int32(innvl, "fd", &fd); + if (error != 0) + return (SET_ERROR(EINVAL)); - case DATASET_NAME: - if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) - error = EINVAL; - if (zfs_ioc_vec[vec].zvec_pool_check) - error = pool_status_check(zc->zc_name, - zfs_ioc_vec[vec].zvec_namecheck); - break; + (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); - case NO_NAME: - break; - } - } + largeblockok = nvlist_exists(innvl, "largeblockok"); + embedok = nvlist_exists(innvl, "embedok"); - dprintf("zfsdev_ioctl -> calling zfs_ioc_vec zvec_func on %d\n", vec); - if (error == 0) - error = zfs_ioc_vec[vec].zvec_func(zc); + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); - rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); +#ifdef __FreeBSD__ + cap_rights_t rights; + + fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp); +#else + file_t *fp = getf(fd); +#endif + if (fp == NULL) + return (SET_ERROR(EBADF)); + + off = fp->f_offset; + error = dmu_send(snapname, fromname, embedok, largeblockok, fd, +#ifdef illumos + resumeobj, resumeoff, fp->f_vnode, &off); +#else + resumeobj, resumeoff, fp, &off); +#endif + +#ifdef illumos + if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) + fp->f_offset = off; +#else + fp->f_offset = off; +#endif + + releasef(fd); + return (error); +} + +/* + * Determine approximately how large a zfs send stream will be -- the number + * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). + * + * innvl: { + * (optional) "from" -> full snap or bookmark name to send an incremental + * from + * } + * + * outnvl: { + * "space" -> bytes of space (uint64) + * } + */ +static int +zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *tosnap; + int error; + char *fromname; + uint64_t space; + + error = dsl_pool_hold(snapname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = nvlist_lookup_string(innvl, "from", &fromname); if (error == 0) { - error = rc; - if (zfs_ioc_vec[vec].zvec_his_log) - zfs_log_history(zc); + if (strchr(fromname, '@') != NULL) { + /* + * If from is a snapshot, hold it and use the more + * efficient dmu_send_estimate to estimate send space + * size using deadlists. + */ + dsl_dataset_t *fromsnap; + error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); + if (error != 0) + goto out; + error = dmu_send_estimate(tosnap, fromsnap, &space); + dsl_dataset_rele(fromsnap, FTAG); + } else if (strchr(fromname, '#') != NULL) { + /* + * If from is a bookmark, fetch the creation TXG of the + * snapshot it was created from and use that to find + * blocks that were born after it. 
+ */ + zfs_bookmark_phys_t frombm; + + error = dsl_bookmark_lookup(dp, fromname, tosnap, + &frombm); + if (error != 0) + goto out; + error = dmu_send_estimate_from_txg(tosnap, + frombm.zbm_creation_txg, &space); + } else { + /* + * from is not properly formatted as a snapshot or + * bookmark + */ + error = SET_ERROR(EINVAL); + goto out; + } + } else { + // If estimating the size of a full send, use dmu_send_estimate + error = dmu_send_estimate(tosnap, NULL, &space); } - kmem_free(zc, sizeof (zfs_cmd_t)); - dprintf("zfsdev_ioctl %d\n", error); + fnvlist_add_uint64(outnvl, "space", space); + +out: + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); return (error); } -#ifdef __NetBSD__ +static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; -#include -#include +static void +zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, + boolean_t log_history, zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; -MODULE(MODULE_CLASS_DRIVER, zfs, "solaris"); + ASSERT3U(ioc, >=, ZFS_IOC_FIRST); + ASSERT3U(ioc, <, ZFS_IOC_LAST); + ASSERT3P(vec->zvec_legacy_func, ==, NULL); + ASSERT3P(vec->zvec_func, ==, NULL); -static int -nb_zvol_copen(dev_t dev, int flag, int mode, lwp_t *l) + vec->zvec_legacy_func = func; + vec->zvec_secpolicy = secpolicy; + vec->zvec_namecheck = namecheck; + vec->zvec_allow_log = log_history; + vec->zvec_pool_check = pool_check; +} + +/* + * See the block comment at the beginning of this file for details on + * each argument to this function. + */ +static void +zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, + zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, + boolean_t allow_log) +{ + zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; + + ASSERT3U(ioc, >=, ZFS_IOC_FIRST); + ASSERT3U(ioc, <, ZFS_IOC_LAST); + ASSERT3P(vec->zvec_legacy_func, ==, NULL); + ASSERT3P(vec->zvec_func, ==, NULL); + + /* if we are logging, the name must be valid */ + ASSERT(!allow_log || namecheck != NO_NAME); + + vec->zvec_name = name; + vec->zvec_func = func; + vec->zvec_secpolicy = secpolicy; + vec->zvec_namecheck = namecheck; + vec->zvec_pool_check = pool_check; + vec->zvec_smush_outnvlist = smush_outnvlist; + vec->zvec_allow_log = allow_log; +} + +static void +zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, boolean_t log_history, + zfs_ioc_poolcheck_t pool_check) { + zfs_ioctl_register_legacy(ioc, func, secpolicy, + POOL_NAME, log_history, pool_check); +} - return zvol_open(&dev, flag, OTYPCHR, kauth_cred_get()); +static void +zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_FALSE, pool_check); } -static int -nb_zvol_cclose(dev_t dev, int flag, int mode, lwp_t *l) +static void +zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { + zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, + POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); +} - return zvol_close(dev, flag, OTYPCHR, kauth_cred_get()); +static void +zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + NO_NAME, B_FALSE, 
POOL_CHECK_NONE); } -static int -nb_zvol_bopen(dev_t dev, int flag, int mode, lwp_t *l) +static void +zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); +} + +static void +zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { + zfs_ioctl_register_dataset_read_secpolicy(ioc, func, + zfs_secpolicy_read); +} - return zvol_open(&dev, flag, OTYPBLK, kauth_cred_get()); +static void +zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } -static int -nb_zvol_bclose(dev_t dev, int flag, int mode, lwp_t *l) +static void +zfs_ioctl_init(void) { + zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, + zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, + zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); + + zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, + zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, + zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, + zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("create", ZFS_IOC_CREATE, + zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("clone", ZFS_IOC_CLONE, + zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, + zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("hold", ZFS_IOC_HOLD, + zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("release", ZFS_IOC_RELEASE, + zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, + zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, + zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); + + zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, + zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, + zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); + + zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, + zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, + POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + + /* IOCTLS that use the legacy function signature */ + + 
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, + zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, + zfs_ioc_pool_scan); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, + zfs_ioc_pool_upgrade); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, + zfs_ioc_vdev_add); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, + zfs_ioc_vdev_remove); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, + zfs_ioc_vdev_set_state); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, + zfs_ioc_vdev_attach); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, + zfs_ioc_vdev_detach); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, + zfs_ioc_vdev_setpath); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, + zfs_ioc_vdev_setfru); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, + zfs_ioc_pool_set_props); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, + zfs_ioc_vdev_split); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, + zfs_ioc_pool_reguid); + + zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, + zfs_ioc_pool_configs, zfs_secpolicy_none); + zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, + zfs_ioc_pool_tryimport, zfs_secpolicy_config); + zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, + zfs_ioc_inject_fault, zfs_secpolicy_inject); + zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, + zfs_ioc_clear_fault, zfs_secpolicy_inject); + zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, + zfs_ioc_inject_list_next, zfs_secpolicy_inject); + + /* + * pool destroy, and export don't log the history as part of + * zfsdev_ioctl, but rather zfs_ioc_pool_export + * does the logging of those commands. 
+ */ + zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, + zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, + zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, + zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, + zfs_ioc_dsobj_to_dsname, + zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, + zfs_ioc_pool_get_history, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); + + zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, + zfs_ioc_space_written); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, + zfs_ioc_objset_recvd_props); + zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, + zfs_ioc_next_obj); + zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, + zfs_ioc_get_fsacl); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, + zfs_ioc_objset_stats); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, + zfs_ioc_objset_zplprops); + zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, + zfs_ioc_dataset_list_next); + zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, + zfs_ioc_snapshot_list_next); + zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, + zfs_ioc_send_progress); + + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, + zfs_ioc_diff, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, + zfs_ioc_obj_to_stats, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, + zfs_ioc_obj_to_path, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, + zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, + zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, + zfs_ioc_send, zfs_secpolicy_send); + + zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, + zfs_secpolicy_none); + zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, + zfs_secpolicy_destroy); + zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, + zfs_secpolicy_rename); + zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, + zfs_secpolicy_recv); + zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, + zfs_secpolicy_promote); + zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, + zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); + zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, + zfs_secpolicy_set_fsacl); + + zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, + zfs_secpolicy_share, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, + zfs_secpolicy_smb_acl, POOL_CHECK_NONE); + 
zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, + zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, + zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); + +#ifdef __FreeBSD__ + zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, + zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE); +#endif +} + +int +pool_status_check(const char *name, zfs_ioc_namecheck_t type, + zfs_ioc_poolcheck_t check) +{ + spa_t *spa; + int error; - return zvol_close(dev, flag, OTYPBLK, kauth_cred_get()); + ASSERT(type == POOL_NAME || type == DATASET_NAME); + + if (check & POOL_CHECK_NONE) + return (0); + + error = spa_open(name, &spa, FTAG); + if (error == 0) { + if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) + error = SET_ERROR(EAGAIN); + else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) + error = SET_ERROR(EROFS); + spa_close(spa, FTAG); + } + return (error); } -static int -nb_zvol_read(dev_t dev, struct uio *uio, int flag) +/* + * Find a free minor number. + */ +minor_t +zfsdev_minor_alloc(void) { + static minor_t last_minor; + minor_t m; - return zvol_read(dev, uio, kauth_cred_get()); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (m = last_minor + 1; m != last_minor; m++) { + if (m > ZFSDEV_MAX_MINOR) + m = 1; + if (ddi_get_soft_state(zfsdev_state, m) == NULL) { + last_minor = m; + return (m); + } + } + + return (0); } +#ifdef __FreeBSD__ static int -nb_zvol_write(dev_t dev, struct uio *uio, int flag) +zfs_ctldev_init(struct cdev *devp) +#else +static int +zfs_ctldev_init(dev_t *devp) +#endif { + minor_t minor; + zfs_soft_state_t *zs; - return zvol_write(dev, uio, kauth_cred_get()); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + minor = zfsdev_minor_alloc(); + if (minor == 0) + return (SET_ERROR(ENXIO)); + + if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) + return (SET_ERROR(EAGAIN)); + +#ifdef __FreeBSD__ + devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); +#endif + + zs = ddi_get_soft_state(zfsdev_state, minor); + zs->zss_type = ZSST_CTLDEV; + zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); + + return (0); } +static void +zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + zfs_onexit_destroy(zo); + ddi_soft_state_free(zfsdev_state, minor); +} + +void * +zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) +{ + zfs_soft_state_t *zp; + + zp = ddi_get_soft_state(zfsdev_state, minor); + if (zp == NULL || zp->zss_type != which) + return (NULL); + + return (zp->zss_data); +} + +#ifdef __FreeBSD__ static int -nb_zfsdev_ioctl(dev_t dev, u_long cmd, void *argp, int flag, lwp_t *l) +zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) +#endif +#ifdef __NetBSD__ +static int +zfsdev_open(dev_t *devp, int flag, int otyp, cred_t *cr) +#endif { - int rval; + int error = 0; - return zfsdev_ioctl(dev, cmd, (intptr_t)argp, flag, kauth_cred_get(), - &rval); +#ifndef __FreeBSD__ + if (getminor(*devp) != 0) + return (zvol_open(devp, flag, otyp, cr)); +#endif + + /* This is the control device. Allocate a new minor if requested. 
*/ + if (flag & FEXCL) { + mutex_enter(&spa_namespace_lock); + error = zfs_ctldev_init(devp); + mutex_exit(&spa_namespace_lock); + } + + return (error); } -const struct bdevsw zfs_bdevsw = { - .d_open = nb_zvol_bopen, - .d_close = nb_zvol_bclose, - .d_strategy = zvol_strategy, - .d_ioctl = nb_zfsdev_ioctl, - .d_dump = nodump, - .d_psize = nosize, - .d_flag = D_DISK | D_MPSAFE -}; +#ifdef __FreeBSD__ +static void +zfsdev_close(void *data) +#endif +#ifdef __NetBSD__ +static int +zfsdev_close(dev_t dev, int flag, int otyp, cred_t *cr) +#endif +{ + zfs_onexit_t *zo; +#ifdef __FreeBSD__ + minor_t minor = (minor_t)(uintptr_t)data; +#endif +#ifdef __NetBSD__ + minor_t minor = getminor(dev); +#endif -const struct cdevsw zfs_cdevsw = { - .d_open = nb_zvol_copen, - .d_close = nb_zvol_cclose, - .d_read = nb_zvol_read, - .d_write = nb_zvol_write, - .d_ioctl = nb_zfsdev_ioctl, - .d_stop = nostop, - .d_tty = notty, - .d_poll = nopoll, - .d_mmap = nommap, - .d_kqfilter = nokqfilter, - .d_flag = D_DISK | D_MPSAFE -}; + if (minor == 0) +#ifdef __FreeBSD__ + return; +#else + return (0); +#endif -uint_t zfs_fsyncer_key; -extern uint_t rrw_tsd_key; + mutex_enter(&spa_namespace_lock); + zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (zo == NULL) { + mutex_exit(&spa_namespace_lock); +#ifdef __FreeBSD__ + return; +#else + return 0; +#endif + } + zfs_ctldev_destroy(zo, minor); + mutex_exit(&spa_namespace_lock); -/* ZFS must be used on machines with at least 512Mb. */ -#define ZFS_MIN_MEGS 512 +#ifndef __FreeBSD__ + return (0); +#endif +} +#ifdef __FreeBSD__ static int -zfs_modcmd(modcmd_t cmd, void *arg) +zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, + struct thread *td) +#endif +#ifdef __NetBSD__ +static int +zfsdev_ioctl(dev_t dev, int zcmd, intptr_t iarg, int flag, cred_t *cr, int *rvalp) +#endif { - int error; - int active, inactive; - uint64_t availrmem; - - switch (cmd) { - case MODULE_CMD_INIT: - if (!rootvnode) - return EAGAIN; + zfs_cmd_t *zc; + uint_t vecnum; + int error, rc, len; + zfs_iocparm_t *zc_iocparm; + int cflag, cmd, oldvecnum; + boolean_t newioc, compat; + void *compat_zc = NULL; +#ifdef __FreeBSD__ + cred_t *cr = td->td_ucred; +#endif + const zfs_ioc_vec_t *vec; + char *saved_poolname = NULL; + nvlist_t *innvl = NULL; +#ifdef __NetBSD__ + caddr_t arg = (caddr_t)iarg; +#endif - printf("WARNING: ZFS on NetBSD is under development\n"); - availrmem = (uint64_t)physmem * PAGE_SIZE / 1048576; - if (availrmem < ZFS_MIN_MEGS * 80 / 100) { - printf("ERROR: at least %dMB of memory required to" - "use ZFS\n", ZFS_MIN_MEGS); - return ENOMEM; +#if defined(illumos) || defined(__NetBSD__) + minor_t minor = getminor(dev); + + if (minor != 0 && + zfsdev_get_soft_state(minor, ZSST_CTLDEV) == NULL) + return (zvol_ioctl(dev, zcmd, iarg, flag, cr, rvalp)); +#endif +#ifdef illumos + vecnum = cmd - ZFS_IOC_FIRST; + ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); +#endif + + cflag = ZFS_CMD_COMPAT_NONE; + compat = B_FALSE; + newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */ + len = IOCPARM_LEN(zcmd); + vecnum = cmd = zcmd & 0xff; + + /* + * Check if we are talking to supported older binaries + * and translate zfs_cmd if necessary + */ + if (len != sizeof(zfs_iocparm_t)) { + newioc = B_FALSE; + compat = B_TRUE; + + vecnum = cmd; + + switch (len) { + case sizeof(zfs_cmd_zcmd_t): + cflag = ZFS_CMD_COMPAT_LZC; + break; + case sizeof(zfs_cmd_deadman_t): + cflag = ZFS_CMD_COMPAT_DEADMAN; + break; + case sizeof(zfs_cmd_v28_t): + cflag = ZFS_CMD_COMPAT_V28; + break; + case 
sizeof(zfs_cmd_v15_t): + cflag = ZFS_CMD_COMPAT_V15; + vecnum = zfs_ioctl_v15_to_v28[cmd]; + + /* + * Return without further handling + * if the command is blacklisted. + */ + if (vecnum == ZFS_IOC_COMPAT_PASS) + return (0); + else if (vecnum == ZFS_IOC_COMPAT_FAIL) + return (ENOTSUP); + break; + default: + return (EINVAL); } - error = lwp_specific_key_create(&zfs_fsyncer_key, NULL); - if (error != 0) { - return error; + } + + if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) + return (SET_ERROR(EINVAL)); + vec = &zfs_ioc_vec[vecnum]; + + zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); + +#ifdef illumos + error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); + if (error != 0) { + error = SET_ERROR(EFAULT); + goto out; + } +#else /* !illumos */ + + bzero(zc, sizeof(zfs_cmd_t)); + + if (newioc) { + zc_iocparm = (void *)arg; + + switch (zc_iocparm->zfs_ioctl_version) { + case ZFS_IOCVER_CURRENT: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) { + error = SET_ERROR(EINVAL); + goto out; + } + break; + case ZFS_IOCVER_INLANES: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_INLANES; + break; + case ZFS_IOCVER_RESUME: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_RESUME; + break; + case ZFS_IOCVER_EDBP: + if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_EDBP; + break; + case ZFS_IOCVER_ZCMD: + if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || + zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { + error = SET_ERROR(EFAULT); + goto out; + } + compat = B_TRUE; + cflag = ZFS_CMD_COMPAT_ZCMD; + break; + default: + error = SET_ERROR(EINVAL); + goto out; + /* NOTREACHED */ } - error = lwp_specific_key_create(&rrw_tsd_key, NULL); - if (error != 0) { - lwp_specific_key_delete(zfs_fsyncer_key); - return error; + + if (compat) { + ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); + compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); + bzero(compat_zc, sizeof(zfs_cmd_t)); + + error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, + compat_zc, zc_iocparm->zfs_cmd_size, flag); + if (error != 0) { + error = SET_ERROR(EFAULT); + goto out; + } + } else { + error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, + zc, zc_iocparm->zfs_cmd_size, flag); + if (error != 0) { + error = SET_ERROR(EFAULT); + goto out; + } } - spa_init(FREAD | FWRITE); - zvol_init(); - zfs_vfsinit(16, MOUNT_ZFS); /* I need to use well defined args. 
*/ - error = devsw_attach("zfs", &zfs_bdevsw, &zfs_bmajor, - &zfs_cdevsw, &zfs_cmajor); - if (error != 0) { - zvol_fini(); - zfs_vfsfini(); - spa_fini(); - lwp_specific_key_delete(zfs_fsyncer_key); - lwp_specific_key_delete(rrw_tsd_key); + } + + if (compat) { + if (newioc) { + ASSERT(compat_zc != NULL); + zfs_cmd_compat_get(zc, compat_zc, cflag); + } else { + ASSERT(compat_zc == NULL); + zfs_cmd_compat_get(zc, arg, cflag); } - return error; + oldvecnum = vecnum; + error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); + if (error != 0) + goto out; + if (oldvecnum != vecnum) + vec = &zfs_ioc_vec[vecnum]; + } +#endif /* !illumos */ - case MODULE_CMD_FINI: - if (spa_busy() || zfs_busy() || zvol_busy() || - zio_injection_enabled) - return EBUSY; - error = devsw_detach(&zfs_bdevsw, &zfs_cdevsw); - zvol_fini(); - zfs_vfsfini(); - spa_fini(); - lwp_specific_key_delete(zfs_fsyncer_key); - lwp_specific_key_delete(rrw_tsd_key); - return error; + zc->zc_iflags = flag & FKIOCTL; + if (zc->zc_nvlist_src_size != 0) { + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &innvl); + if (error != 0) + goto out; + } - case MODULE_CMD_AUTOUNLOAD: - /* - * We don't want to be autounloaded because unlike - * other subsystems, we read our own configuration - * from disk and provide things that might be used - * later (zvols). - */ - return EBUSY; + /* rewrite innvl for backwards compatibility */ + if (compat) + innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); - default: - return ENOTTY; + /* + * Ensure that all pool/dataset names are valid before we pass down to + * the lower layers. + */ + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + switch (vec->zvec_namecheck) { + case POOL_NAME: + if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) + error = SET_ERROR(EINVAL); + else + error = pool_status_check(zc->zc_name, + vec->zvec_namecheck, vec->zvec_pool_check); + break; + + case DATASET_NAME: + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) + error = SET_ERROR(EINVAL); + else + error = pool_status_check(zc->zc_name, + vec->zvec_namecheck, vec->zvec_pool_check); + break; + + case NO_NAME: + break; } -} -#else /* __NetBSD__ */ + if (error == 0) + error = vec->zvec_secpolicy(zc, innvl, cr); -int -_fini(void) -{ - int error; + if (error != 0) + goto out; - if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) - return (EBUSY); + /* legacy ioctls can modify zc_name */ + len = strcspn(zc->zc_name, "/@#") + 1; + saved_poolname = kmem_alloc(len, KM_SLEEP); + (void) strlcpy(saved_poolname, zc->zc_name, len); + + if (vec->zvec_func != NULL) { + nvlist_t *outnvl; + int puterror = 0; + spa_t *spa; + nvlist_t *lognv = NULL; - if ((error = mod_remove(&modlinkage)) != 0) - return (error); + ASSERT(vec->zvec_legacy_func == NULL); - zvol_fini(); - zfs_fini(); - spa_fini(); - if (zfs_nfsshare_inited) - (void) ddi_modclose(nfs_mod); - if (zfs_smbshare_inited) - (void) ddi_modclose(smbsrv_mod); - if (zfs_nfsshare_inited || zfs_smbshare_inited) - (void) ddi_modclose(sharefs_mod); + /* + * Add the innvl to the lognv before calling the func, + * in case the func changes the innvl. 
+ */ + if (vec->zvec_allow_log) { + lognv = fnvlist_alloc(); + fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, + vec->zvec_name); + if (!nvlist_empty(innvl)) { + fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, + innvl); + } + } + + outnvl = fnvlist_alloc(); + error = vec->zvec_func(zc->zc_name, innvl, outnvl); + + if (error == 0 && vec->zvec_allow_log && + spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (!nvlist_empty(outnvl)) { + fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, + outnvl); + } + (void) spa_history_log_nvl(spa, lognv); + spa_close(spa, FTAG); + } + fnvlist_free(lognv); + + /* rewrite outnvl for backwards compatibility */ + if (compat) + outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum, + cflag); + + if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { + int smusherror = 0; + if (vec->zvec_smush_outnvlist) { + smusherror = nvlist_smush(outnvl, + zc->zc_nvlist_dst_size); + } + if (smusherror == 0) + puterror = put_nvlist(zc, outnvl); + } + + if (puterror != 0) + error = puterror; + + nvlist_free(outnvl); + } else { + error = vec->zvec_legacy_func(zc); + } - tsd_destroy(&zfs_fsyncer_key); - ldi_ident_release(zfs_li); - zfs_li = NULL; - mutex_destroy(&zfs_share_lock); +out: + nvlist_free(innvl); + +#ifdef illumos + rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); + if (error == 0 && rc != 0) + error = SET_ERROR(EFAULT); +#else + if (compat) { + zfs_ioctl_compat_post(zc, cmd, cflag); + if (newioc) { + ASSERT(compat_zc != NULL); + ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); + + zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag); + rc = ddi_copyout(compat_zc, + (void *)(uintptr_t)zc_iocparm->zfs_cmd, + zc_iocparm->zfs_cmd_size, flag); + if (error == 0 && rc != 0) + error = SET_ERROR(EFAULT); + kmem_free(compat_zc, sizeof (zfs_cmd_t)); + } else { + zfs_cmd_compat_put(zc, arg, vecnum, cflag); + } + } else { + ASSERT(newioc); + + rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, + sizeof (zfs_cmd_t), flag); + if (error == 0 && rc != 0) + error = SET_ERROR(EFAULT); + } +#endif + if (error == 0 && vec->zvec_allow_log) { + char *s = tsd_get(zfs_allow_log_key); + if (s != NULL) + strfree(s); + (void) tsd_set(zfs_allow_log_key, saved_poolname); + } else { + if (saved_poolname != NULL) + strfree(saved_poolname); + } + kmem_free(zc, sizeof (zfs_cmd_t)); return (error); } +static void +zfs_allow_log_destroy(void *arg) +{ + char *poolname = arg; + strfree(poolname); +} + +#ifdef illumos static int zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { @@ -4752,8 +6713,8 @@ zfs_info(dev_info_t *dip, ddi_info_cmd_t * so most of the standard driver entry points are in zvol.c. 
*/ static struct cb_ops zfs_cb_ops = { - zvol_open, /* open */ - zvol_close, /* close */ + zfsdev_open, /* open */ + zfsdev_close, /* close */ zvol_strategy, /* strategy */ nodev, /* print */ zvol_dump, /* dump */ @@ -4800,10 +6761,6 @@ static struct modlinkage modlinkage = { NULL }; - -uint_t zfs_fsyncer_key; -extern uint_t rrw_tsd_key; - int _init(void) { @@ -4812,6 +6769,7 @@ _init(void) spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); + zfs_ioctl_init(); if ((error = mod_install(&modlinkage)) != 0) { zvol_fini(); @@ -4821,7 +6779,9 @@ _init(void) } tsd_create(&zfs_fsyncer_key, NULL); - tsd_create(&rrw_tsd_key, NULL); + tsd_create(&zfs_putpages_key, NULL); + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); + tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); @@ -4836,7 +6796,7 @@ _fini(void) int error; if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) - return (EBUSY); + return (SET_ERROR(EBUSY)); if ((error = mod_remove(&modlinkage)) != 0) return (error); @@ -4864,4 +6824,382 @@ _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } -#endif /* __NetBSD__ */ +#endif /* illumos */ + +#ifdef __FreeBSD__ +static struct cdevsw zfs_cdevsw = { + .d_version = D_VERSION, + .d_open = zfsdev_open, + .d_ioctl = zfsdev_ioctl, + .d_name = ZFS_DEV_NAME +}; + +static void +zfsdev_init(void) +{ + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, + ZFS_DEV_NAME); +} + +static void +zfsdev_fini(void) +{ + if (zfsdev != NULL) + destroy_dev(zfsdev); +} + +static struct root_hold_token *zfs_root_token; +struct proc *zfsproc; + +static int zfs__init(void); +static int zfs__fini(void); +static void zfs_shutdown(void *, int); + +static eventhandler_tag zfs_shutdown_event_tag; + +#define ZFS_MIN_KSTACK_PAGES 4 + +int +zfs__init(void) +{ + +#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES + printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " + "overflow panic!\nPlease consider adding " + "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, + ZFS_MIN_KSTACK_PAGES); +#endif + zfs_root_token = root_mount_hold("ZFS"); + + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + zfs_ioctl_init(); + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); + tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); + tsd_create(&zfs_geom_probe_vdev_key, NULL); + + printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); + root_mount_rel(zfs_root_token); + + zfsdev_init(); + + return (0); +} + +int +zfs__fini(void) +{ + if (spa_busy() || zfs_busy() || zvol_busy() || + zio_injection_enabled) { + return (EBUSY); + } + + zfsdev_fini(); + zvol_fini(); + zfs_fini(); + spa_fini(); + + tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&rrw_tsd_key); + tsd_destroy(&zfs_allow_log_key); + + mutex_destroy(&zfs_share_lock); + + return (0); +} + +static void +zfs_shutdown(void *arg __unused, int howto __unused) +{ + + /* + * ZFS fini routines can not properly work in a panic-ed system. 
+ */ + if (panicstr == NULL) + (void)zfs__fini(); +} + + +static int +zfs_modevent(module_t mod, int type, void *unused __unused) +{ + int err; + + switch (type) { + case MOD_LOAD: + err = zfs__init(); + if (err == 0) + zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( + shutdown_post_sync, zfs_shutdown, NULL, + SHUTDOWN_PRI_FIRST); + return (err); + case MOD_UNLOAD: + err = zfs__fini(); + if (err == 0 && zfs_shutdown_event_tag != NULL) + EVENTHANDLER_DEREGISTER(shutdown_post_sync, + zfs_shutdown_event_tag); + return (err); + case MOD_SHUTDOWN: + return (0); + default: + break; + } + return (EOPNOTSUPP); +} + +static moduledata_t zfs_mod = { + "zfsctrl", + zfs_modevent, + 0 +}; +DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); +MODULE_VERSION(zfsctrl, 1); +MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1); +MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); +MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); + +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ + +#include +#include + +MODULE(MODULE_CLASS_DRIVER, zfs, "solaris"); + +static int +nb_zfsdev_copen(dev_t dev, int flag, int mode, lwp_t *l) +{ + + return zfsdev_open(&dev, flag, OTYPCHR, kauth_cred_get()); +} + +static int +nb_zfsdev_cclose(dev_t dev, int flag, int mode, lwp_t *l) +{ + + return zfsdev_close(dev, flag, OTYPCHR, kauth_cred_get()); +} + +static int +nb_zfsdev_bopen(dev_t dev, int flag, int mode, lwp_t *l) +{ + + return zfsdev_open(&dev, flag, OTYPBLK, kauth_cred_get()); +} + +static int +nb_zfsdev_bclose(dev_t dev, int flag, int mode, lwp_t *l) +{ + + return zfsdev_close(dev, flag, OTYPBLK, kauth_cred_get()); +} + +static int +nb_zvol_read(dev_t dev, struct uio *uio, int flag) +{ + + return zvol_read(dev, uio, kauth_cred_get()); +} + +static int +nb_zvol_write(dev_t dev, struct uio *uio, int flag) +{ + + return zvol_write(dev, uio, kauth_cred_get()); +} + +static int +nb_zfsdev_ioctl(dev_t dev, u_long cmd, void *argp, int flag, lwp_t *l) +{ + int rval; + + return zfsdev_ioctl(dev, cmd, (intptr_t)argp, flag, kauth_cred_get(), + &rval); +} + +static void +nb_zvol_strategy(struct buf *bp) +{ + + (void) zvol_strategy(bp); +} + +const struct bdevsw zfs_bdevsw = { + .d_open = nb_zfsdev_bopen, + .d_close = nb_zfsdev_bclose, + .d_strategy = nb_zvol_strategy, + .d_ioctl = nb_zfsdev_ioctl, + .d_dump = nodump, + .d_psize = nosize, + .d_flag = D_DISK | D_MPSAFE +}; + +const struct cdevsw zfs_cdevsw = { + .d_open = nb_zfsdev_copen, + .d_close = nb_zfsdev_cclose, + .d_read = nb_zvol_read, + .d_write = nb_zvol_write, + .d_ioctl = nb_zfsdev_ioctl, + .d_stop = nostop, + .d_tty = notty, + .d_poll = nopoll, + .d_mmap = nommap, + .d_kqfilter = nokqfilter, + .d_flag = D_DISK | D_MPSAFE +}; + +/* ZFS should only be used on systems with enough memory. 
*/ +#define ZFS_MIN_MEGS 512 + +static int zfs_version_ioctl = ZFS_IOCVER_CURRENT; +static int zfs_version_spa = SPA_VERSION; +static struct sysctllog *zfs_sysctl_log; + +static void +zfs_sysctl_init(void) +{ + const struct sysctlnode *rnode; + + sysctl_createv(&zfs_sysctl_log, 0, NULL, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "zfs", + SYSCTL_DESCR("zfs"), + NULL, 0, NULL, 0, + CTL_VFS, CTL_CREATE, CTL_EOL); + + sysctl_createv(&zfs_sysctl_log, 0, &rnode, &rnode, + CTLFLAG_PERMANENT, + CTLTYPE_NODE, "version", + SYSCTL_DESCR("version"), + NULL, 0, NULL, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&zfs_sysctl_log, 0, &rnode, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READONLY, + CTLTYPE_INT, "ioctl", + SYSCTL_DESCR("ZFS ioctl version"), + NULL, 0, &zfs_version_ioctl, 0, + CTL_CREATE, CTL_EOL); + + sysctl_createv(&zfs_sysctl_log, 0, &rnode, NULL, + CTLFLAG_PERMANENT|CTLFLAG_READONLY, + CTLTYPE_INT, "spa", + SYSCTL_DESCR("ZFS SPA version"), + NULL, 0, &zfs_version_spa, 0, + CTL_CREATE, CTL_EOL); +} + +static void +zfs_sysctl_fini(void) +{ + + sysctl_teardown(&zfs_sysctl_log); +} + + +static void +zfs_loadvnode_destroy(void *arg) +{ + + if (arg != NULL) + panic("thread exiting with TSD loadvnode data %p", arg); +} + +static int +zfs_modcmd(modcmd_t cmd, void *arg) +{ + int error; + int active, inactive; + uint64_t availrmem; + + extern struct vfsops zfs_vfsops; + extern uint_t zfs_loadvnode_key; + extern uint_t zfs_putpage_key; + + switch (cmd) { + case MODULE_CMD_INIT: + if (!rootvnode) + return EAGAIN; + + /* XXXNETBSD trim is not supported yet */ + zfs_trim_enabled = B_FALSE; + + printf("WARNING: ZFS on NetBSD is under development\n"); + availrmem = (uint64_t)physmem * PAGE_SIZE / 1048576; + if (availrmem < ZFS_MIN_MEGS * 80 / 100) { + printf("ERROR: at least %dMB of memory required to" + "use ZFS\n", ZFS_MIN_MEGS); + return ENOMEM; + } + mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL); + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); + tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); + tsd_create(&zfs_loadvnode_key, zfs_loadvnode_destroy); + tsd_create(&zfs_putpage_key, NULL); + + spa_init(FREAD | FWRITE); + zfs_init(); + zvol_init(); + zfs_ioctl_init(); + zfs_sysctl_init(); + + error = devsw_attach("zfs", &zfs_bdevsw, &zfs_bmajor, + &zfs_cdevsw, &zfs_cmajor); + if (error != 0) { + goto attacherr; + } + (void) vfs_attach(&zfs_vfsops); + return error; + + case MODULE_CMD_FINI: + if (spa_busy() || zfs_busy() || zvol_busy() || + zio_injection_enabled) + return EBUSY; + + error = vfs_detach(&zfs_vfsops); + if (error) + return error; + + (void) devsw_detach(&zfs_bdevsw, &zfs_cdevsw); + +attacherr: + zfs_sysctl_fini(); + zvol_fini(); + zfs_fini(); + spa_fini(); + + tsd_destroy(&zfs_putpage_key); + tsd_destroy(&zfs_loadvnode_key); + tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&rrw_tsd_key); + tsd_destroy(&zfs_allow_log_key); + + mutex_destroy(&zfs_debug_mtx); + mutex_destroy(&zfs_share_lock); + + return error; + + case MODULE_CMD_AUTOUNLOAD: + /* + * We don't want to be autounloaded because unlike + * other subsystems, we read our own configuration + * from disk and provide things that might be used + * later (zvols). 
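On the NetBSD side, zfs_sysctl_init() above registers a permanent "zfs" node under CTL_VFS with a "version" subtree carrying the read-only ioctl and SPA version integers, so they should be reachable from userland as vfs.zfs.version.ioctl and vfs.zfs.version.spa (the dotted names are inferred from the node names passed to sysctl_createv(), not spelled out in the patch). A small check under that assumption:

#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int spa_version;
        size_t len = sizeof(spa_version);

        if (sysctlbyname("vfs.zfs.version.spa", &spa_version, &len,
            NULL, 0) == -1) {
                perror("sysctlbyname");
                return 1;
        }
        printf("SPA version: %d\n", spa_version);
        return 0;
}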
+ */ + return EBUSY; + + default: + return ENOTTY; + } +} + +#endif /* __NetBSD__ */ Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c,v retrieving revision 1.1.1.2 diff -u -p -r1.1.1.2 zfs_log.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c 27 Feb 2010 22:31:25 -0000 1.1.1.2 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c 3 Dec 2016 17:03:49 -0000 @@ -19,8 +19,9 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ #include @@ -29,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -39,12 +39,10 @@ #include #include #include -#include #include #include #include #include -#include #include /* @@ -75,7 +73,11 @@ zfs_log_create_txtype(zil_create_t type, if (vsecp == NULL && !isxvattr) return (TX_CREATE); if (vsecp && isxvattr) +#ifdef TODO return (TX_CREATE_ACL_ATTR); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif if (vsecp) return (TX_CREATE_ACL); else @@ -85,7 +87,11 @@ zfs_log_create_txtype(zil_create_t type, if (vsecp == NULL && !isxvattr) return (TX_MKDIR); if (vsecp && isxvattr) +#ifdef TODO return (TX_MKDIR_ACL_ATTR); +#else + panic("%s:%u: unsupported condition", __func__, __LINE__); +#endif if (vsecp) return (TX_MKDIR_ACL); else @@ -170,6 +176,12 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) *attrs |= (xoap->xoa_reparse == 0) ? 0 : XAT0_REPARSE; + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + *attrs |= (xoap->xoa_offline == 0) ? 0 : + XAT0_OFFLINE; + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + *attrs |= (xoap->xoa_sparse == 0) ? 0 : + XAT0_SPARSE; } static void * @@ -206,9 +218,8 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fu } /* - * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, - * TX_MKDIR_ATTR and TX_MKXATTR - * transactions. + * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and + * TK_MKXATTR transactions. * * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID * domain information appended prior to the name. In this case the @@ -231,10 +242,9 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t zfs_fuid_info_t *fuidp, vattr_t *vap) { itx_t *itx; - uint64_t seq; lr_create_t *lr; lr_acl_create_t *lracl; - size_t aclsize; + size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0; size_t xvatsize = 0; size_t txsize; xvattr_t *xvap = (xvattr_t *)vap; @@ -264,7 +274,6 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; lrsize = sizeof (*lr); } else { - aclsize = (vsecp) ? 
vsecp->vsa_aclentsz : 0; txsize = sizeof (lr_acl_create_t) + namesize + fuidsz + ZIL_ACE_LENGTH(aclsize) + xvatsize; @@ -276,21 +285,25 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - if (!IS_EPHEMERAL(zp->z_phys->zp_uid)) { - lr->lr_uid = (uint64_t)zp->z_phys->zp_uid; + lr->lr_mode = zp->z_mode; + if (!IS_EPHEMERAL(zp->z_uid)) { + lr->lr_uid = (uint64_t)zp->z_uid; } else { lr->lr_uid = fuidp->z_fuid_owner; } - if (!IS_EPHEMERAL(zp->z_phys->zp_gid)) { - lr->lr_gid = (uint64_t)zp->z_phys->zp_gid; + if (!IS_EPHEMERAL(zp->z_gid)) { + lr->lr_gid = (uint64_t)zp->z_gid; } else { lr->lr_gid = fuidp->z_fuid_group; } - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; - lr->lr_rdev = zp->z_phys->zp_rdev; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); + + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev, + sizeof (lr->lr_rdev)) != 0) + lr->lr_rdev = 0; /* * Fill in xvattr info if any @@ -329,20 +342,17 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t */ bcopy(name, end, namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* - * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. + * Handles both TX_REMOVE and TX_RMDIR transactions. */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, char *name) + znode_t *dzp, char *name, uint64_t foid) { itx_t *itx; - uint64_t seq; lr_remove_t *lr; size_t namesize = strlen(name) + 1; @@ -354,19 +364,19 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t lr->lr_doid = dzp->z_id; bcopy(name, (char *)(lr + 1), namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; + itx->itx_oid = foid; + + zil_itx_assign(zilog, itx, tx); } /* - * zfs_log_link() handles TX_LINK transactions. + * Handles TX_LINK transactions. */ void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *dzp, znode_t *zp, char *name) + znode_t *dzp, znode_t *zp, char *name) { itx_t *itx; - uint64_t seq; lr_link_t *lr; size_t namesize = strlen(name) + 1; @@ -379,20 +389,17 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *t lr->lr_link_obj = zp->z_id; bcopy(name, (char *)(lr + 1), namesize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* - * zfs_log_symlink() handles TX_SYMLINK transactions. + * Handles TX_SYMLINK transactions. 
*/ void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link) { itx_t *itx; - uint64_t seq; lr_create_t *lr; size_t namesize = strlen(name) + 1; size_t linksize = strlen(link) + 1; @@ -404,29 +411,27 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t lr = (lr_create_t *)&itx->itx_lr; lr->lr_doid = dzp->z_id; lr->lr_foid = zp->z_id; - lr->lr_mode = zp->z_phys->zp_mode; - lr->lr_uid = zp->z_phys->zp_uid; - lr->lr_gid = zp->z_phys->zp_gid; - lr->lr_gen = zp->z_phys->zp_gen; - lr->lr_crtime[0] = zp->z_phys->zp_crtime[0]; - lr->lr_crtime[1] = zp->z_phys->zp_crtime[1]; + lr->lr_uid = zp->z_uid; + lr->lr_gid = zp->z_gid; + lr->lr_mode = zp->z_mode; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + lr->lr_crtime, sizeof (uint64_t) * 2); bcopy(name, (char *)(lr + 1), namesize); bcopy(link, (char *)(lr + 1) + namesize, linksize); - seq = zil_itx_assign(zilog, itx, tx); - dzp->z_last_itx = seq; - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* - * zfs_log_rename() handles TX_RENAME transactions. + * Handles TX_RENAME transactions. */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, - znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) { itx_t *itx; - uint64_t seq; lr_rename_t *lr; size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; @@ -440,36 +445,36 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t lr->lr_tdoid = tdzp->z_id; bcopy(sname, (char *)(lr + 1), snamesize); bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + itx->itx_oid = szp->z_id; - seq = zil_itx_assign(zilog, itx, tx); - sdzp->z_last_itx = seq; - tdzp->z_last_itx = seq; - szp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* - * zfs_log_write() handles TX_WRITE transactions. + * Handles TX_WRITE transactions. */ ssize_t zfs_immediate_write_sz = 32768; +#ifdef _KERNEL +SYSCTL_DECL(_vfs_zfs); +SYSCTL_LONG(_vfs_zfs, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN, + &zfs_immediate_write_sz, 0, "Minimal size for indirect log write"); +#endif void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, offset_t off, ssize_t resid, int ioflag) + znode_t *zp, offset_t off, ssize_t resid, int ioflag) { + uint32_t blocksize = zp->z_blksz; itx_wr_state_t write_state; - boolean_t slogging; uintptr_t fsync_cnt; - ssize_t immediate_write_sz; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; - immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) - ? 0 : zfs_immediate_write_sz; - - slogging = spa_has_slogs(zilog->zl_spa) && - (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); - if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz) + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + write_state = WR_INDIRECT; + else if (!spa_has_slogs(zilog->zl_spa) && + resid >= zfs_immediate_write_sz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; @@ -483,30 +488,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t * while (resid) { itx_t *itx; lr_write_t *lr; - ssize_t len; + itx_wr_state_t wr_state = write_state; + ssize_t len = resid; - /* - * If the write would overflow the largest block then split it. 
- */ - if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) - len = SPA_MAXBLOCKSIZE >> 1; - else - len = resid; + if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA) + wr_state = WR_NEED_COPY; + else if (wr_state == WR_INDIRECT) + len = MIN(blocksize - P2PHASE(off, blocksize), resid); itx = zil_itx_create(txtype, sizeof (*lr) + - (write_state == WR_COPIED ? len : 0)); + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; - if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, + if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; - write_state = WR_NEED_COPY; + wr_state = WR_NEED_COPY; } - itx->itx_wr_state = write_state; - if (write_state == WR_NEED_COPY) - itx->itx_sod += len; + itx->itx_wr_state = wr_state; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len; @@ -515,13 +516,11 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t * itx->itx_private = zp->z_zfsvfs; - if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || - (ioflag & (FSYNC | FDSYNC))) - itx->itx_sync = B_TRUE; - else + if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && + (fsync_cnt == 0)) itx->itx_sync = B_FALSE; - zp->z_last_itx = zil_itx_assign(zilog, itx, tx); + zil_itx_assign(zilog, itx, tx); off += len; resid -= len; @@ -529,14 +528,13 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t * } /* - * zfs_log_truncate() handles TX_TRUNCATE transactions. + * Handles TX_TRUNCATE transactions. */ void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, uint64_t off, uint64_t len) + znode_t *zp, uint64_t off, uint64_t len) { itx_t *itx; - uint64_t seq; lr_truncate_t *lr; if (zil_replaying(zilog, tx) || zp->z_unlinked) @@ -549,19 +547,17 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_ lr->lr_length = len; itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* - * zfs_log_setattr() handles TX_SETATTR transactions. + * Handles TX_SETATTR transactions. */ void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, - znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) { itx_t *itx; - uint64_t seq; lr_setattr_t *lr; xvattr_t *xvap = (xvattr_t *)vap; size_t recsize = sizeof (lr_setattr_t); @@ -613,19 +609,17 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t (void) zfs_log_fuid_domains(fuidp, start); itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } /* - * zfs_log_acl() handles TX_ACL transactions. + * Handles TX_ACL transactions. 
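The reworked zfs_log_write() above no longer splits records against ZIL_MAX_LOG_DATA: a WR_COPIED record that would exceed ZIL_MAX_COPIED_DATA is demoted to WR_NEED_COPY, and WR_INDIRECT records are clipped so each one stays within a single block via MIN(blocksize - P2PHASE(off, blocksize), resid). A small standalone illustration of that clipping with arbitrary example values (P2PHASE(off, blocksize) equals off % blocksize for a power-of-two block size):

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        uint64_t blocksize = 131072;                    /* 128K records */
        uint64_t off = 133120, resid = 307200;          /* 300K write at 130K */

        while (resid > 0) {
                uint64_t len = MIN(blocksize - (off % blocksize), resid);
                printf("WR_INDIRECT record: %ju bytes at offset %ju\n",
                    (uintmax_t)len, (uintmax_t)off);
                off += len;
                resid -= len;
        }
        /* 129024 @ 133120, then 131072 @ 262144, then 47104 @ 393216 */
        return 0;
}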
*/ void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) { itx_t *itx; - uint64_t seq; lr_acl_v0_t *lrv0; lr_acl_t *lr; int txtype; @@ -681,6 +675,5 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx } itx->itx_sync = (zp->z_sync_cnt != 0); - seq = zil_itx_assign(zilog, itx, tx); - zp->z_last_itx = seq; + zil_itx_assign(zilog, itx, tx); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_onexit.c 5 May 2017 22:07:28 -0000 @@ -0,0 +1,265 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS kernel routines may add/delete callback routines to be invoked + * upon process exit (triggered via the close operation from the /dev/zfs + * driver). + * + * These cleanup callbacks are intended to allow for the accumulation + * of kernel state across multiple ioctls. User processes participate + * by opening ZFS_DEV with O_EXCL. This causes the ZFS driver to do a + * clone-open, generating a unique minor number. The process then passes + * along that file descriptor to each ioctl that might have a cleanup operation. + * + * Consumers of the onexit routines should call zfs_onexit_fd_hold() early + * on to validate the given fd and add a reference to its file table entry. + * This allows the consumer to do its work and then add a callback, knowing + * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers + * should call zfs_onexit_fd_rele(). + * + * A simple example is zfs_ioc_recv(), where we might create an AVL tree + * with dataset/GUID mappings and then reuse that tree on subsequent + * zfs_ioc_recv() calls. + * + * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() + * the AVL tree and pass it along with a callback function to + * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the + * callback and return an action handle. + * + * The action handle is then passed from user space to subsequent + * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree + * by calling zfs_onexit_cb_data() with the device minor number and + * action handle. + * + * If the user process exits abnormally, the callback is invoked implicitly + * as part of the driver close operation. 
Once the user space process is + * finished with the accumulated kernel state, it can also just call close(2) + * on the cleanup fd to trigger the cleanup callback. + */ + +void +zfs_onexit_init(zfs_onexit_t **zop) +{ + zfs_onexit_t *zo; + + zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); + mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), + offsetof(zfs_onexit_action_node_t, za_link)); +} + +void +zfs_onexit_destroy(zfs_onexit_t *zo) +{ + zfs_onexit_action_node_t *ap; + + mutex_enter(&zo->zo_lock); + while ((ap = list_head(&zo->zo_actions)) != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + mutex_enter(&zo->zo_lock); + } + mutex_exit(&zo->zo_lock); + + list_destroy(&zo->zo_actions); + mutex_destroy(&zo->zo_lock); + kmem_free(zo, sizeof (zfs_onexit_t)); +} + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); + if (*zo == NULL) + return (SET_ERROR(EBADF)); + + return (0); +} + +/* + * Consumers might need to operate by minor number instead of fd, since + * they might be running in another thread (e.g. txg_sync_thread). Callers + * of this function must call zfs_onexit_fd_rele() when they're finished + * using the minor number. + */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + file_t *fp; + zfs_onexit_t *zo; + +#ifdef __FreeBSD__ + file_t *tmpfp; + cap_rights_t rights; + void *data; + int error; + + fp = getf(fd, cap_rights_init(&rights)); + if (fp == NULL) + return (SET_ERROR(EBADF)); + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + if (error == 0) + *minorp = (minor_t)(uintptr_t)data; + curthread->td_fpop = tmpfp; + if (error != 0) + return (SET_ERROR(EBADF)); +#else + fp = getf(fd); + if (fp == NULL) + return (SET_ERROR(EBADF)); + + *minorp = getminor(fp->f_vnode->v_rdev); +#endif + + return (zfs_onexit_minor_to_state(*minorp, &zo)); +} + +void +zfs_onexit_fd_rele(int fd) +{ + releasef(fd); +} + +/* + * Add a callback to be invoked when the calling process exits. + */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); + list_link_init(&ap->za_link); + ap->za_func = func; + ap->za_data = data; + + mutex_enter(&zo->zo_lock); + list_insert_tail(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (action_handle) + *action_handle = (uint64_t)(uintptr_t)ap; + + return (0); +} + +static zfs_onexit_action_node_t * +zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle) +{ + zfs_onexit_action_node_t *match; + zfs_onexit_action_node_t *ap; + list_t *l; + + ASSERT(MUTEX_HELD(&zo->zo_lock)); + + match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle; + l = &zo->zo_actions; + for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) { + if (match == ap) + break; + } + return (ap); +} + +/* + * Delete the callback, triggering it first if 'fire' is set. 
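Putting the routines above together, a consumer of the onexit interfaces holds the cleanup fd it was handed, registers a callback with some private state, and on later ioctls recovers that state by minor number and action handle; the callback fires when the process closes its /dev/zfs clone. A rough sketch of that flow (the state structure and function names are made up, and error handling is abbreviated):

struct recv_state {
        avl_tree_t      rs_guid_map;    /* state accumulated across ioctls */
};

static void
recv_state_cleanup(void *arg)
{
        struct recv_state *rs = arg;

        /* Runs at close of the cleanup fd, or when deleted with fire set. */
        kmem_free(rs, sizeof (*rs));
}

static int
recv_state_attach(int cleanup_fd, uint64_t *handlep)
{
        struct recv_state *rs;
        minor_t minor;
        int error;

        error = zfs_onexit_fd_hold(cleanup_fd, &minor);
        if (error != 0)
                return (error);

        rs = kmem_zalloc(sizeof (*rs), KM_SLEEP);
        error = zfs_onexit_add_cb(minor, recv_state_cleanup, rs, handlep);
        if (error != 0)
                kmem_free(rs, sizeof (*rs));

        zfs_onexit_fd_rele(cleanup_fd);
        return (error);
}

/* A later ioctl recovers the same state from the handle: */
static int
recv_state_lookup(minor_t minor, uint64_t handle, struct recv_state **rsp)
{
        return (zfs_onexit_cb_data(minor, handle, (void **)rsp));
}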
+ */ +int +zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (fire) + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + } else { + mutex_exit(&zo->zo_lock); + error = SET_ERROR(ENOENT); + } + + return (error); +} + +/* + * Return the data associated with this callback. This allows consumers + * of the cleanup-on-exit interfaces to stash kernel data across system + * calls, knowing that it will be cleaned up if the calling process exits. + */ +int +zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + *data = NULL; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + mutex_enter(&zo->zo_lock); + ap = zfs_onexit_find_cb(zo, action_handle); + if (ap != NULL) + *data = ap->za_data; + else + error = SET_ERROR(ENOENT); + mutex_exit(&zo->zo_lock); + + return (error); +} Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c,v retrieving revision 1.10 diff -u -p -r1.10 zfs_replay.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c 7 Feb 2014 15:29:20 -0000 1.10 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c 12 May 2017 21:45:09 -0000 @@ -19,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
*/ #include @@ -54,9 +54,9 @@ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, - uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { - vattr_null(vap); + VATTR_NULL(vap); vap->va_mask = (uint_t)mask; if (mask & AT_TYPE) vap->va_type = IFTOVT(mode); @@ -74,7 +74,7 @@ zfs_init_vattr(vattr_t *vap, uint64_t ma static int zfs_replay_error(zfsvfs_t *zfsvfs, lr_t *lr, boolean_t byteswap) { - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); } static void @@ -132,6 +132,10 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xva bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); } static int @@ -321,7 +325,6 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); switch (txtype) { case TX_CREATE_ACL: aclstart = (caddr_t)(lracl + 1); @@ -394,9 +397,8 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, #endif break; default: - error = ENOTSUP; + error = SET_ERROR(ENOTSUP); } - VOP_UNLOCK(ZTOV(dzp)); bail: if (error == 0 && vp != NULL) @@ -476,7 +478,10 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_c } cn.cn_cred = kcred; - cn.cn_flags = 0; +#ifndef __NetBSD__ + cn.cn_thread = curthread; + cn.cn_flags = SAVENAME; +#endif vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); switch (txtype) { @@ -526,13 +531,17 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_c error = VOP_SYMLINK(ZTOV(dzp), &vp, &cn, &xva.xva_vattr, link /*,vflg*/); break; default: - error = ENOTSUP; + error = SET_ERROR(ENOTSUP); } - VOP_UNLOCK(ZTOV(dzp)); + VOP_UNLOCK(ZTOV(dzp), 0); out: if (error == 0 && vp != NULL) +#ifdef __NetBSD__ VN_RELE(vp); +#else + VN_URELE(vp); +#endif VN_RELE(ZTOV(dzp)); @@ -563,21 +572,21 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_r cn.cn_nameptr = name; cn.cn_namelen = strlen(name); cn.cn_nameiop = DELETE; - cn.cn_flags = ISLASTCN; - //cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + cn.cn_flags = ISLASTCN | SAVENAME; cn.cn_cred = kcred; +#ifndef __NetBSD__ + cn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + cn.cn_thread = curthread; +#endif vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(dzp), &vp, &cn); if (error != 0) { - VOP_UNLOCK(ZTOV(dzp)); - goto fail; - } - error = vn_lock(vp, LK_EXCLUSIVE); - if (error != 0) { - VOP_UNLOCK(ZTOV(dzp)); - vrele(vp); + VOP_UNLOCK(ZTOV(dzp), 0); goto fail; } +#ifdef __NetBSD__ + VOP_UNLOCK(vp, 0); +#endif switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: @@ -587,10 +596,15 @@ zfs_replay_remove(zfsvfs_t *zfsvfs, lr_r error = VOP_RMDIR(ZTOV(dzp), vp, &cn /*,vflg*/); break; default: - error = ENOTSUP; + error = SET_ERROR(ENOTSUP); } +#ifdef __NetBSD__ + vrele(vp); +#else vput(vp); - VOP_UNLOCK(ZTOV(dzp)); +#endif + VOP_UNLOCK(ZTOV(dzp), 0); + fail: VN_RELE(ZTOV(dzp)); @@ -619,15 +633,23 @@ zfs_replay_link(zfsvfs_t *zfsvfs, lr_lin if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; + cn.cn_nameptr = name; cn.cn_cred = kcred; - cn.cn_flags = 0; +#ifndef __NetBSD__ + cn.cn_thread = curthread; + cn.cn_flags = SAVENAME; +#endif vn_lock(ZTOV(dzp), LK_EXCLUSIVE | LK_RETRY); +#ifndef __NetBSD__ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); +#endif error = VOP_LINK(ZTOV(dzp), ZTOV(zp), &cn /*,vflg*/); - VOP_UNLOCK(ZTOV(zp)); - 
VOP_UNLOCK(ZTOV(dzp)); +#ifndef __NetBSD__ + VOP_UNLOCK(ZTOV(zp), 0); +#endif + VOP_UNLOCK(ZTOV(dzp), 0); VN_RELE(ZTOV(zp)); VN_RELE(ZTOV(dzp)); @@ -665,36 +687,39 @@ zfs_replay_rename(zfsvfs_t *zfsvfs, lr_r scn.cn_nameptr = sname; scn.cn_namelen = strlen(sname); scn.cn_nameiop = DELETE; - scn.cn_flags = ISLASTCN; -// scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + scn.cn_flags = ISLASTCN | SAVENAME; scn.cn_cred = kcred; +#ifndef __NetBSD__ + scn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + scn.cn_thread = td; +#endif vn_lock(ZTOV(sdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(sdzp), &svp, &scn); - VOP_UNLOCK(ZTOV(sdzp)); + VOP_UNLOCK(ZTOV(sdzp), 0); if (error != 0) goto fail; + VOP_UNLOCK(svp, 0); tcn.cn_nameptr = tname; tcn.cn_namelen = strlen(tname); tcn.cn_nameiop = RENAME; - tcn.cn_flags = ISLASTCN; -// tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + tcn.cn_flags = ISLASTCN | SAVENAME; tcn.cn_cred = kcred; +#ifndef __NetBSD__ + tcn.cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + tcn.cn_thread = td; +#endif vn_lock(ZTOV(tdzp), LK_EXCLUSIVE | LK_RETRY); error = VOP_LOOKUP(ZTOV(tdzp), &tvp, &tcn); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { - VOP_UNLOCK(ZTOV(tdzp)); + VOP_UNLOCK(ZTOV(tdzp), 0); goto fail; - } else { - error = vn_lock(tvp, LK_EXCLUSIVE); - if (error != 0) { - VOP_UNLOCK(ZTOV(tdzp)); - vrele(tvp); - goto fail; - } } +#ifdef __NetBSD__ + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); +#endif error = VOP_RENAME(ZTOV(sdzp), svp, &scn, ZTOV(tdzp), tvp, &tcn /*,vflg*/); return (error); @@ -716,7 +741,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_wr znode_t *zp; int error; ssize_t resid; - uint64_t orig_eof, eod, offset, length; + uint64_t eod, offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -734,9 +759,20 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_wr offset = lr->lr_offset; length = lr->lr_length; - eod = offset + length; /* end of data for this write */ + eod = offset + length; /* end of data for this write */ + + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. This needs to be done within the single dmu + * transaction created within vn_rdwr -> zfs_write. So a possible + * new end of file is passed through in zfsvfs->z_replay_eof + */ - orig_eof = zp->z_phys->zp_size; + zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { @@ -745,23 +781,15 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_wr offset -= offset % blocksize; length = blocksize; } + if (zp->z_size < eod) + zfsvfs->z_replay_eof = eod; } error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); - /* - * This may be a write from a dmu_sync() for a whole block, - * and may extend beyond the current end of the file. - * We can't just replay what was written for this TX_WRITE as - * a future TX_WRITE2 may extend the eof and the data for that - * write needs to be there. So we write the whole block and - * reduce the eof. - */ - if (orig_eof < zp->z_phys->zp_size) /* file length grew ? 
*/ - zp->z_phys->zp_size = eod; - VN_RELE(ZTOV(zp)); + zfsvfs->z_replay_eof = 0; /* safety */ return (error); } @@ -785,10 +813,31 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_w if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); +top: end = lr->lr_offset + lr->lr_length; - if (end > zp->z_phys->zp_size) { - ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); - zp->z_phys->zp_size = end; + if (end > zp->z_size) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + zp->z_size = end; + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + VN_RELE(ZTOV(zp)); + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + return (error); + } + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + + /* Ensure the replayed seq is updated */ + (void) zil_replaying(zfsvfs->z_log, tx); + + dmu_tx_commit(tx); } VN_RELE(ZTOV(zp)); @@ -799,11 +848,14 @@ zfs_replay_write2(zfsvfs_t *zfsvfs, lr_w static int zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) { +#ifdef illumos + znode_t *zp; + flock64_t fl; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); -#ifdef __NetBSD__ - ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); - return (EOPNOTSUPP); -#else if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); @@ -819,6 +871,9 @@ zfs_replay_truncate(zfsvfs_t *zfsvfs, lr VN_RELE(ZTOV(zp)); return (error); +#else + ZFS_LOG(0, "Unexpected code path, report to pjd@FreeBSD.org"); + return (EOPNOTSUPP); #endif } @@ -869,7 +924,7 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_ vp = ZTOV(zp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, vap, kcred); - VOP_UNLOCK(vp); + VOP_UNLOCK(vp, 0); zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; @@ -878,11 +933,15 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_ return (error); } +extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct); + static int zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) { ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ vsecattr_t vsa; + vnode_t *vp; znode_t *zp; int error; @@ -901,13 +960,12 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_a vsa.vsa_aclflags = 0; vsa.vsa_aclentp = ace; -#ifdef TODO - error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); -#else - panic("%s:%u: unsupported condition", __func__, __LINE__); -#endif + vp = ZTOV(zp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); + VOP_UNLOCK(vp, 0); - VN_RELE(ZTOV(zp)); + VN_RELE(vp); return (error); } @@ -932,6 +990,7 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_ ace_t *ace = (ace_t *)(lr + 1); vsecattr_t vsa; znode_t *zp; + vnode_t *vp; int error; if (byteswap) { @@ -947,7 +1006,6 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); -#ifdef TODO bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; @@ -964,16 +1022,16 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_ lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); } - error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL); + vp = ZTOV(zp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL); + VOP_UNLOCK(vp, 0); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); -#else - error = EOPNOTSUPP; 
-#endif zfsvfs->z_fuid_replay = NULL; - VN_RELE(ZTOV(zp)); + VN_RELE(vp); return (error); } Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c,v retrieving revision 1.5 diff -u -p -r1.5 zfs_rlock.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c 29 Feb 2016 16:18:37 -0000 1.5 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c 19 Jun 2013 13:06:07 -0000 @@ -19,13 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ /* * This file contains the code to implement file range locking in - * ZFS, although there isn't much specific to ZFS (all that comes to mind + * ZFS, although there isn't much specific to ZFS (all that comes to mind is * support for growing the blocksize). * * Interface @@ -94,36 +97,6 @@ #include -static int -zfs_range_lock_hold(rl_t *rl) -{ - - KASSERT(rl->r_zp != NULL); - KASSERT(0 < rl->r_refcnt); - KASSERT(mutex_owned(&rl->r_zp->z_range_lock)); - - if (rl->r_refcnt >= ULONG_MAX) - return (ENFILE); /* XXX What to do? */ - - rl->r_refcnt++; - return (0); -} - -static void -zfs_range_lock_rele(rl_t *rl) -{ - - KASSERT(rl->r_zp != NULL); - KASSERT(0 < rl->r_refcnt); - KASSERT(mutex_owned(&rl->r_zp->z_range_lock)); - - if (--rl->r_refcnt == 0) { - cv_destroy(&rl->r_wr_cv); - cv_destroy(&rl->r_rd_cv); - kmem_free(rl, sizeof (rl_t)); - } -} - /* * Check if a write lock can be grabbed, or wait and recheck until available. */ @@ -142,7 +115,7 @@ zfs_range_lock_writer(znode_t *zp, rl_t * Range locking is also used by zvol and uses a * dummied up znode. However, for zvol, we don't need to * append or grow blocksize, and besides we don't have - * a z_phys or z_zfsvfs - so skip that processing. + * a "sa" data or z_zfsvfs - so skip that processing. * * Yes, this is ugly, and would be solved by not handling * grow or append in range lock code. If that was done then @@ -155,14 +128,14 @@ zfs_range_lock_writer(znode_t *zp, rl_t * This is done under z_range_lock to avoid races. */ if (new->r_type == RL_APPEND) - new->r_off = zp->z_phys->zp_size; + new->r_off = zp->z_size; /* * If we need to grow the block size then grab the whole * file range. This is also done under z_range_lock to * avoid races. 
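For context on how these range locks are consumed elsewhere in ZFS: a caller takes a lock covering the byte range it is about to touch, performs the I/O, and then drops the lock; RL_APPEND and block-size growth are the two cases the writer path above special-cases under z_range_lock. A minimal sketch of the calling pattern (the surrounding read helper is invented for illustration; zfs_range_lock() and zfs_range_unlock() are the interfaces this hunk modifies):

static int
example_read_range(znode_t *zp, uint64_t off, uint64_t len, uio_t *uio)
{
        rl_t *rl;
        int error;

        /* Keep concurrent writers out of just this byte range. */
        rl = zfs_range_lock(zp, off, len, RL_READER);

        /* Clip the request against the current end of file. */
        if (off >= zp->z_size) {
                zfs_range_unlock(rl);
                return (0);
        }
        if (off + len > zp->z_size)
                len = zp->z_size - off;

        error = dmu_read_uio(zp->z_zfsvfs->z_os, zp->z_id, uio, len);

        zfs_range_unlock(rl);
        return (error);
}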
*/ - end_size = MAX(zp->z_phys->zp_size, new->r_off + len); + end_size = MAX(zp->z_size, new->r_off + len); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zp->z_zfsvfs->z_max_blksz)) { new->r_off = 0; @@ -187,12 +160,10 @@ zfs_range_lock_writer(znode_t *zp, rl_t goto wait; /* already locked at same offset */ rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); - KASSERT(0 < rl->r_refcnt); if (rl && (rl->r_off < new->r_off + new->r_len)) goto wait; rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); - KASSERT(0 < rl->r_refcnt); if (rl && rl->r_off + rl->r_len > new->r_off) goto wait; @@ -201,12 +172,10 @@ zfs_range_lock_writer(znode_t *zp, rl_t return; wait: if (!rl->r_write_wanted) { + cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); rl->r_write_wanted = B_TRUE; } - if (zfs_range_lock_hold(rl) != 0) - panic("too many waiters on zfs range lock %p", rl); cv_wait(&rl->r_wr_cv, &zp->z_range_lock); - zfs_range_lock_rele(rl); /* reset to original */ new->r_off = off; @@ -234,17 +203,13 @@ zfs_range_proxify(avl_tree_t *tree, rl_t /* create a proxy range lock */ proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); - proxy->r_zp = rl->r_zp; proxy->r_off = rl->r_off; proxy->r_len = rl->r_len; proxy->r_cnt = 1; proxy->r_type = RL_READER; proxy->r_proxy = B_TRUE; - cv_init(&proxy->r_wr_cv, NULL, CV_DEFAULT, NULL); - cv_init(&proxy->r_rd_cv, NULL, CV_DEFAULT, NULL); proxy->r_write_wanted = B_FALSE; proxy->r_read_wanted = B_FALSE; - proxy->r_refcnt = 1; avl_add(tree, proxy); return (proxy); @@ -267,15 +232,11 @@ zfs_range_split(avl_tree_t *tree, rl_t * /* create the rear proxy range lock */ rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rear->r_zp = rl->r_zp; rear->r_off = off; rear->r_len = rl->r_off + rl->r_len - off; rear->r_cnt = rl->r_cnt; rear->r_type = RL_READER; rear->r_proxy = B_TRUE; - cv_init(&rear->r_wr_cv, NULL, CV_DEFAULT, NULL); - cv_init(&rear->r_rd_cv, NULL, CV_DEFAULT, NULL); - rear->r_refcnt = 1; rear->r_write_wanted = B_FALSE; rear->r_read_wanted = B_FALSE; @@ -290,30 +251,25 @@ zfs_range_split(avl_tree_t *tree, rl_t * * Create and add a new proxy range lock for the supplied range. 
*/ static void -zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len, znode_t *zp) +zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) { rl_t *rl; ASSERT(len); rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); - rl->r_zp = zp; rl->r_off = off; rl->r_len = len; rl->r_cnt = 1; rl->r_type = RL_READER; rl->r_proxy = B_TRUE; - cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); - cv_init(&rl->r_rd_cv, NULL, CV_DEFAULT, NULL); rl->r_write_wanted = B_FALSE; rl->r_read_wanted = B_FALSE; - rl->r_refcnt = 1; avl_add(tree, rl); } static void zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) { - znode_t *zp = new->r_zp; rl_t *next; uint64_t off = new->r_off; uint64_t len = new->r_len; @@ -350,10 +306,9 @@ zfs_range_add_reader(avl_tree_t *tree, r return; } - KASSERT(0 < next->r_refcnt); if (off < next->r_off) { /* Add a proxy for initial range before the overlap */ - zfs_range_new_proxy(tree, off, next->r_off - off, zp); + zfs_range_new_proxy(tree, off, next->r_off - off); } new->r_cnt = 0; /* will use proxies in tree */ @@ -370,31 +325,28 @@ zfs_range_add_reader(avl_tree_t *tree, r /* there's a gap */ ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - next->r_off - (prev->r_off + prev->r_len), zp); + next->r_off - (prev->r_off + prev->r_len)); } if (off + len == next->r_off + next->r_len) { /* exact overlap with end */ next = zfs_range_proxify(tree, next); - KASSERT(0 < next->r_refcnt); next->r_cnt++; return; } if (off + len < next->r_off + next->r_len) { /* new range ends in the middle of this block */ next = zfs_range_split(tree, next, off + len); - KASSERT(0 < next->r_refcnt); next->r_cnt++; return; } ASSERT3U(off + len, >, next->r_off + next->r_len); next = zfs_range_proxify(tree, next); - KASSERT(0 < next->r_refcnt); next->r_cnt++; } /* Add the remaining end range. 
*/ zfs_range_new_proxy(tree, prev->r_off + prev->r_len, - (off + len) - (prev->r_off + prev->r_len), zp); + (off + len) - (prev->r_off + prev->r_len)); } /* @@ -423,13 +375,10 @@ retry: if (prev && (off < prev->r_off + prev->r_len)) { if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { if (!prev->r_read_wanted) { + cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); prev->r_read_wanted = B_TRUE; } - if (zfs_range_lock_hold(prev) != 0) - panic("too many waiters on zfs range lock %p", - prev); cv_wait(&prev->r_rd_cv, &zp->z_range_lock); - zfs_range_lock_rele(prev); goto retry; } if (off + len < prev->r_off + prev->r_len) @@ -449,13 +398,10 @@ retry: goto got_lock; if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { if (!next->r_read_wanted) { + cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); next->r_read_wanted = B_TRUE; } - if (zfs_range_lock_hold(next) != 0) - panic("too many waiters on zfs range lock %p", - next); cv_wait(&next->r_rd_cv, &zp->z_range_lock); - zfs_range_lock_rele(next); goto retry; } if (off + len <= next->r_off + next->r_len) @@ -492,11 +438,8 @@ zfs_range_lock(znode_t *zp, uint64_t off new->r_cnt = 1; /* assume it's going to be in the tree */ new->r_type = type; new->r_proxy = B_FALSE; - cv_init(&new->r_wr_cv, NULL, CV_DEFAULT, NULL); - cv_init(&new->r_rd_cv, NULL, CV_DEFAULT, NULL); new->r_write_wanted = B_FALSE; new->r_read_wanted = B_FALSE; - new->r_refcnt = 1; mutex_enter(&zp->z_range_lock); if (type == RL_READER) { @@ -507,9 +450,8 @@ zfs_range_lock(znode_t *zp, uint64_t off avl_add(&zp->z_range_avl, new); else zfs_range_lock_reader(zp, new); - } else { + } else zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ - } mutex_exit(&zp->z_range_lock); return (new); } @@ -521,7 +463,7 @@ static void zfs_range_unlock_reader(znode_t *zp, rl_t *remove) { avl_tree_t *tree = &zp->z_range_avl; - rl_t *rl, *next; + rl_t *rl, *next = NULL; uint64_t len; /* @@ -535,14 +477,16 @@ zfs_range_unlock_reader(znode_t *zp, rl_ avl_remove(tree, remove); if (remove->r_write_wanted) { cv_broadcast(&remove->r_wr_cv); + cv_destroy(&remove->r_wr_cv); } if (remove->r_read_wanted) { cv_broadcast(&remove->r_rd_cv); + cv_destroy(&remove->r_rd_cv); } } else { - ASSERT3U(remove->r_cnt, ==, 0); - ASSERT3U(remove->r_write_wanted, ==, 0); - ASSERT3U(remove->r_read_wanted, ==, 0); + ASSERT0(remove->r_cnt); + ASSERT0(remove->r_write_wanted); + ASSERT0(remove->r_read_wanted); /* * Find start proxy representing this reader lock, * then decrement ref count on all proxies @@ -566,15 +510,17 @@ zfs_range_unlock_reader(znode_t *zp, rl_ avl_remove(tree, rl); if (rl->r_write_wanted) { cv_broadcast(&rl->r_wr_cv); + cv_destroy(&rl->r_wr_cv); } if (rl->r_read_wanted) { cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); } - zfs_range_lock_rele(rl); + kmem_free(rl, sizeof (rl_t)); } } } - zfs_range_lock_rele(remove); + kmem_free(remove, sizeof (rl_t)); } /* @@ -593,14 +539,16 @@ zfs_range_unlock(rl_t *rl) if (rl->r_type == RL_WRITER) { /* writer locks can't be shared or split */ avl_remove(&zp->z_range_avl, rl); + mutex_exit(&zp->z_range_lock); if (rl->r_write_wanted) { cv_broadcast(&rl->r_wr_cv); + cv_destroy(&rl->r_wr_cv); } if (rl->r_read_wanted) { cv_broadcast(&rl->r_rd_cv); + cv_destroy(&rl->r_rd_cv); } - zfs_range_lock_rele(rl); - mutex_exit(&zp->z_range_lock); + kmem_free(rl, sizeof (rl_t)); } else { /* * lock may be shared, let zfs_range_unlock_reader() @@ -632,11 +580,11 @@ zfs_range_reduce(rl_t *rl, uint64_t off, mutex_enter(&zp->z_range_lock); rl->r_off = off; rl->r_len = len; 
+ mutex_exit(&zp->z_range_lock); if (rl->r_write_wanted) cv_broadcast(&rl->r_wr_cv); if (rl->r_read_wanted) cv_broadcast(&rl->r_rd_cv); - mutex_exit(&zp->z_range_lock); } /* Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c =================================================================== RCS file: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c diff -N src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_sa.c 10 Oct 2016 11:09:56 -0000 @@ -0,0 +1,327 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include + +/* + * ZPL attribute registration table. + * Order of attributes doesn't matter + * a unique value will be assigned for each + * attribute that is file system specific + * + * This is just the set of ZPL attributes that this + * version of ZFS deals with natively. The file system + * could have other attributes stored in files, but they will be + * ignored. The SA framework will preserve them, just that + * this version of ZFS won't change or delete them. 
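Once registered, these attributes are accessed through the SA handle instead of the old znode_phys_t bonus layout, which is why the rest of this patch rewrites call sites to sa_lookup()/sa_update() (for example the lr_gen and lr_crtime lookups in zfs_log_create() above). A short sketch of that pattern against the table that follows, with the transaction setup elided:

static void
example_sa_access(znode_t *zp, dmu_tx_t *tx)
{
        zfsvfs_t *zfsvfs = zp->z_zfsvfs;
        uint64_t gen, crtime[2];

        /* Read individual attributes through their SA_ZPL_* handles. */
        (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
            &gen, sizeof (gen));
        (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
            crtime, sizeof (crtime));

        /* Updates must happen inside an assigned dmu transaction. */
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
            (void *)&zp->z_size, sizeof (uint64_t), tx);
}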
+ */ + +sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, + {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, + {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, + {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, + {"ZPL_DACL_ACES", 0, SA_ACL, 0}, + {NULL, 0, 0, 0} +}; + +#ifdef _KERNEL + +int +zfs_sa_readlink(znode_t *zp, uio_t *uio) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + size_t bufsz; + int error; + + bufsz = zp->z_size; + if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { + error = uiomove((caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, + 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + } + return (error); +} + +void +zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + + if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { + VERIFY(dmu_set_bonus(db, + len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); + if (len) { + bcopy(link, (caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, len); + } + } else { + dmu_buf_t *dbp; + + zfs_grow_blocksize(zp, len, tx); + VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os, + zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); + + dmu_buf_will_dirty(dbp, tx); + + ASSERT3U(len, <=, dbp->db_size); + bcopy(link, dbp->db_data, len); + dmu_buf_rele(dbp, FTAG); + } +} + +void +zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) { + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)) != 0) + return; + } else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) + return; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + + if (len <= doi.doi_bonus_size) { + (void) memcpy(xoap->xoa_av_scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + sizeof (xoap->xoa_av_scanstamp)); + } + } + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); +} + +void +zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + xoptattr_t *xoap; + + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) + VERIFY(0 == sa_update(zp->z_sa_hdl, 
SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), tx)); + else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(db, len, tx) == 0); + (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + + zp->z_pflags |= ZFS_BONUS_SCANSTAMP; + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &zp->z_pflags, sizeof (uint64_t), tx)); + } +} + +/* + * I'm not convinced we should do any of this upgrade. + * since the SA code can read both old/new znode formats + * with probably little to no performance difference. + * + * All new files will be created with the new format. + */ + +void +zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(hdl); + znode_t *zp = sa_get_userdata(hdl); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + sa_bulk_attr_t bulk[20]; + int count = 0; + sa_bulk_attr_t sa_attrs[20] = { 0 }; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t uid, gid, mode, rdev, xattr, parent; + uint64_t crtime[2], mtime[2], ctime[2]; + zfs_acl_phys_t znode_acl; + char scanstamp[AV_SCANSTAMP_SZ]; + + /* + * No upgrade if ACL isn't cached + * since we won't know which locks are held + * and ready the ACL would require special "locked" + * interfaces that would be messy + */ + if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK) + return; + + /* + * If the vnode lock is held and we aren't the owner + * then just return since we don't want to deadlock + * trying to update the status of z_is_sa. This + * file can then be upgraded at a later time. + * + * Otherwise, we know we are doing the + * sa_update() that caused us to enter this function. 
+ */ + if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0) + return; + + /* First do a bulk query of the attributes that aren't cached */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &znode_acl, 88); + + if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) + goto done; + + + /* + * While the order here doesn't matter its best to try and organize + * it is such a way to pick up an already existing layout number + */ + count = 0; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), + NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, + zp->z_atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &zp->z_acl_cached->z_acl_count, 8); + + if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) + zfs_acl_xform(zp, zp->z_acl_cached, CRED()); + + locate.cb_aclp = zp->z_acl_cached; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); + + if (xattr) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), + NULL, &xattr, 8); + + /* if scanstamp then add scanstamp */ + + if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { + bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + scanstamp, AV_SCANSTAMP_SZ); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), + NULL, scanstamp, AV_SCANSTAMP_SZ); + zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; + } + + VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); + VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, + count, tx) == 0); + if (znode_acl.z_acl_extern_obj) + VERIFY(0 == dmu_object_free(zfsvfs->z_os, + znode_acl.z_acl_extern_obj, tx)); + + zp->z_is_sa = B_TRUE; +done: + VOP_UNLOCK(ZTOV(zp), 0); +} + +void +zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) +{ + if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) + return; + + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + if (zfs_external_acl(zp)) { + dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, + DMU_OBJECT_END); 
+ } +} + +#endif Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c,v retrieving revision 1.15 diff -u -p -r1.15 zfs_vfsops.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c 17 Feb 2017 08:31:23 -0000 1.15 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c 16 Jun 2017 16:35:57 -0000 @@ -19,19 +19,24 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ +/* Portions Copyright 2010 Robert Milkowski */ + #include #include #include +#include #include #include -#include +#include #include #include -#include #include #include #include @@ -45,11 +50,11 @@ #include #include #include +#include +#include #include #include #include -#include -#include #include #include #include @@ -57,36 +62,89 @@ #include #include #include +#include "zfs_comutil.h" + +#ifdef __FreeBSD_kernel__ + +#include + +struct mtx zfs_debug_mtx; +MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + +SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + +int zfs_debug_level; +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, + "Debug level"); + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); + +static int zfs_mount(vfs_t *vfsp); +static int zfs_umount(vfs_t *vfsp, int fflag); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); +static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); +static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); +static int zfs_sync(vfs_t *vfsp, int waitfor); +static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors); +static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); +static void zfs_objset_close(zfsvfs_t *zfsvfs); +static void zfs_freevfs(vfs_t *vfsp); + +struct vfsops zfs_vfsops = { + .vfs_mount = zfs_mount, + .vfs_unmount = zfs_umount, + .vfs_root = zfs_root, + .vfs_statfs = zfs_statfs, + .vfs_vget = zfs_vget, + .vfs_sync = zfs_sync, + .vfs_checkexp = zfs_checkexp, + .vfs_fhtovp = zfs_fhtovp, +}; + +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); + +#endif /* __FreeBSD_kernel__ */ #ifdef __NetBSD__ -/* include ddi_name_to_major function is there better place for it ?*/ -#include -#include -#endif -int zfsfstype; -vfsops_t *zfs_vfsops = NULL; -static major_t zfs_major; -static minor_t zfs_minor; -static kmutex_t zfs_dev_mtx; +#include +#include int 
zfs_debug_level; kmutex_t zfs_debug_mtx; -/* XXX NetBSD static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);*/ +#define DROP_GIANT() /* nothing */ +#define PICKUP_GIANT() /* nothing */ +#define vfs_stdsync(a, b) 0 + static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len); static int zfs_umount(vfs_t *vfsp, int fflag); -static int zfs_root(vfs_t *vfsp, vnode_t **vpp); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); +static int zfs_netbsd_root(vfs_t *vfsp, vnode_t **vpp); static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp); static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp); -static int zfs_start(vfs_t *vfsp, int flags); +static int zfs_sync(vfs_t *vfsp, int waitfor); +static int zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr); static void zfs_freevfs(vfs_t *vfsp); void zfs_init(void); void zfs_fini(void); - extern const struct vnodeopv_desc zfs_vnodeop_opv_desc; static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = { @@ -94,23 +152,23 @@ static const struct vnodeopv_desc * cons NULL, }; -static struct vfsops zfs_vfsops_template = { +struct vfsops zfs_vfsops = { .vfs_name = MOUNT_ZFS, .vfs_min_mount_data = sizeof(struct zfs_args), .vfs_opv_descs = zfs_vnodeop_descs, .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, - .vfs_root = zfs_root, + .vfs_root = zfs_netbsd_root, .vfs_statvfs = zfs_statvfs, - .vfs_sync = zfs_sync, + .vfs_sync = zfs_netbsd_sync, .vfs_vget = zfs_vget, .vfs_loadvnode = zfs_loadvnode, .vfs_fhtovp = zfs_fhtovp, .vfs_init = zfs_init, .vfs_done = zfs_fini, - .vfs_start = zfs_start, - .vfs_renamelock_enter = (void*)nullop, - .vfs_renamelock_exit = (void*)nullop, + .vfs_start = (void *)nullop, + .vfs_renamelock_enter = genfs_renamelock_enter, + .vfs_renamelock_exit = genfs_renamelock_exit, .vfs_reinit = (void *)nullop, .vfs_vptofh = (void *)eopnotsupp, .vfs_fhtovp = (void *)eopnotsupp, @@ -121,34 +179,6 @@ static struct vfsops zfs_vfsops_template .vfs_fsync = (void *)eopnotsupp, }; -/* - * We need to keep a count of active fs's. - * This is necessary to prevent our module - * from being unloaded after a umount -f - */ -static uint32_t zfs_active_fs_count = 0; - -static char *noatime_cancel[] = { MNTOPT_ATIME, NULL }; -static char *atime_cancel[] = { MNTOPT_NOATIME, NULL }; -static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL }; -static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL }; - -/* - * MO_DEFAULT is not used since the default value is determined - * by the equivalent property. - */ -static mntopt_t mntopts[] = { - { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL }, - { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL }, - { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL }, - { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL } -}; - -static mntopts_t zfs_mntopts = { - sizeof (mntopts) / sizeof (mntopt_t), - mntopts -}; - static bool zfs_sync_selector(void *cl, struct vnode *vp) { @@ -163,26 +193,12 @@ zfs_sync_selector(void *cl, struct vnode && !zp->z_unlinked; } -/*ARGSUSED*/ -int -zfs_sync(vfs_t *vfsp, int flag, cred_t *cr) +static int +zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr) { + struct vnode_iterator *marker; zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; vnode_t *vp; - struct vnode_iterator *marker; - dmu_tx_t *tx; - int error; - - - error = 0; - - /* - * Data integrity is job one. We don't want a compromised kernel - * writing to the storage pool, so we never sync during panic. 
- */ - if (panicstr) - return (0); /* * On NetBSD, we need to push out atime updates. Solaris does @@ -192,35 +208,73 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t * vfs_vnode_iterator_init(vfsp, &marker); while ((vp = vfs_vnode_iterator_next(marker, zfs_sync_selector, NULL))) { + znode_t *zp; + dmu_buf_t *dbp; + dmu_tx_t *tx; + int error; + error = vn_lock(vp, LK_EXCLUSIVE); if (error) { - vrele(vp); + VN_RELE(vp); continue; } + ZFS_ENTER(zfsvfs); zp = VTOZ(vp); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } + ZFS_EXIT(zfsvfs); vput(vp); } vfs_vnode_iterator_destroy(marker); /* - * SYNC_ATTR is used by fsflush() to force old filesystems like UFS - * to sync metadata, which they would otherwise cache indefinitely. - * Semantically, the only requirement is that the sync be initiated. - * The DMU syncs out txgs frequently, so there's nothing to do. + * Then do the regular ZFS stuff. + */ + return zfs_sync(vfsp, waitfor); +} + +static int +zfs_netbsd_root(vfs_t *vfsp, vnode_t **vpp) +{ + + return zfs_root(vfsp, LK_EXCLUSIVE | LK_RETRY, vpp); +} + +#endif /* __NetBSD__ */ + +/* + * We need to keep a count of active fs's. + * This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +/*ARGSUSED*/ +static int +zfs_sync(vfs_t *vfsp, int waitfor) +{ + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * Ignore the system syncher. ZFS already commits async data + * at zfs_txg_timeout intervals. 
*/ - if ((flag & MNT_LAZY) != 0) + if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { @@ -229,6 +283,11 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t * */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; + int error; + + error = vfs_stdsync(vfsp, waitfor); + if (error != 0) + return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); @@ -243,9 +302,8 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t * } if (zfsvfs->z_log != NULL) - zil_commit(zfsvfs->z_log, UINT64_MAX, 0); - else - txg_wait_synced(dp, 0); + zil_commit(zfsvfs->z_log, 0); + ZFS_EXIT(zfsvfs); } else { /* @@ -259,17 +317,18 @@ zfs_sync(vfs_t *vfsp, int flag, cred_t * return (0); } +#ifdef illumos static int zfs_create_unique_device(dev_t *dev) { major_t new_major; do { - ASSERT3U(zfs_minor, <=, MAXMIN); + ASSERT3U(zfs_minor, <=, MAXMIN32); minor_t start = zfs_minor; do { mutex_enter(&zfs_dev_mtx); - if (zfs_minor >= MAXMIN) { + if (zfs_minor >= MAXMIN32) { /* * If we're still using the real major * keep out of /dev/zfs and /dev/zvol minor @@ -286,8 +345,7 @@ zfs_create_unique_device(dev_t *dev) *dev = makedevice(zfs_major, zfs_minor); mutex_exit(&zfs_dev_mtx); } while (vfs_devismounted(*dev) && zfs_minor != start); - break; -#ifndef __NetBSD__ +#ifdef illumos if (zfs_minor == start) { /* * We are using all ~262,000 minor numbers for the @@ -313,6 +371,8 @@ zfs_create_unique_device(dev_t *dev) return (0); } +#endif /* illumos */ + static void atime_changed_cb(void *arg, uint64_t newval) @@ -321,10 +381,12 @@ atime_changed_cb(void *arg, uint64_t new if (newval == TRUE) { zfsvfs->z_atime = TRUE; + zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; + zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } @@ -339,14 +401,14 @@ xattr_changed_cb(void *arg, uint64_t new /* XXX locking on vfs_flag? */ #ifdef TODO zfsvfs->z_vfs->vfs_flag |= VFS_XATTR; -#endif +#endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0); } else { /* XXX locking on vfs_flag? 
*/ #ifdef TODO zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR; -#endif +#endif vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0); } @@ -356,13 +418,12 @@ static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; - - if (newval < SPA_MINBLOCKSIZE || - newval > SPA_MAXBLOCKSIZE || !ISP2(newval)) - newval = SPA_MAXBLOCKSIZE; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; - zfsvfs->z_vfs->vfs_bsize = newval; + zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void @@ -384,22 +445,6 @@ readonly_changed_cb(void *arg, uint64_t } static void -devices_changed_cb(void *arg, uint64_t newval) -{ - zfsvfs_t *zfsvfs = arg; - - if (newval == FALSE) { - zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0); - } else { - zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES; - vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES); - vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0); - } -} - -static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; @@ -491,12 +536,20 @@ zfs_register_callbacks(vfs_t *vfsp) objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; - int readonly, do_readonly = B_FALSE; - int setuid, do_setuid = B_FALSE; - int exec, do_exec = B_FALSE; - int devices, do_devices = B_FALSE; - int xattr, do_xattr = B_FALSE; - int atime, do_atime = B_FALSE; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; +#ifdef illumos + boolean_t devices = B_FALSE; + boolean_t do_devices = B_FALSE; +#endif + boolean_t xattr = B_FALSE; + boolean_t do_xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; int error = 0; ASSERT(vfsp); @@ -505,12 +558,20 @@ zfs_register_callbacks(vfs_t *vfsp) os = zfsvfs->z_os; /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. + */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { @@ -518,19 +579,9 @@ zfs_register_callbacks(vfs_t *vfsp) do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - devices = B_FALSE; setuid = B_FALSE; - do_devices = B_TRUE; do_setuid = B_TRUE; } else { - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { - devices = B_FALSE; - do_devices = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) { - devices = B_TRUE; - do_devices = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; @@ -562,6 +613,19 @@ zfs_register_callbacks(vfs_t *vfsp) } /* + * We need to enter pool configuration here, so that we can use + * dsl_prop_get_int_ds() to handle the special nbmand property below. 
+ * dsl_prop_get_integer() can not be used, because it has to acquire + * spa_namespace_lock and we can not do that because we already hold + * z_teardown_lock. The problem is that spa_config_sync() is called + * with spa_namespace_lock held and the function calls ZFS vnode + * operations to write the cache file and thus z_teardown_lock is + * acquired after spa_namespace_lock. + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + + /* * nbmand is a special property. It can only be changed at * mount time. * @@ -572,14 +636,9 @@ zfs_register_callbacks(vfs_t *vfsp) nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; - } else { - char osname[MAXNAMELEN]; - - dmu_objset_name(os, osname); - if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand, - NULL)) { - return (error); - } + } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) { + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + return (error); } /* @@ -589,28 +648,32 @@ zfs_register_callbacks(vfs_t *vfsp) * the first prop_register(), but I guess I like to go * overboard... */ - ds = dmu_objset_ds(os); - error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "xattr", xattr_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "recordsize", blksz_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "readonly", readonly_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); +#ifdef illumos error = error ? error : dsl_prop_register(ds, - "devices", devices_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); +#endif error = error ? error : dsl_prop_register(ds, - "setuid", setuid_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "exec", exec_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "snapdir", snapdir_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "aclmode", acl_mode_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, - "aclinherit", acl_inherit_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); error = error ? error : dsl_prop_register(ds, - "vscan", vscan_changed_cb, zfsvfs); + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; @@ -623,8 +686,6 @@ zfs_register_callbacks(vfs_t *vfsp) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); - if (do_devices) - devices_changed_cb(zfsvfs, devices); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) @@ -635,64 +696,70 @@ zfs_register_callbacks(vfs_t *vfsp) return (0); unregister: - /* - * We may attempt to unregister some callbacks that are not - * registered, but this is OK; it will simply return ENOMSG, - * which we will ignore. 
- */ - (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); - (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, - zfsvfs); - (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs); + dsl_prop_unregister_all(ds, zfsvfs); return (error); - } -static void -uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, - int64_t delta, dmu_tx_t *tx) +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp) { - uint64_t used = 0; - char buf[32]; - int err; - uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (SET_ERROR(ENOENT)); - if (delta == 0) - return; + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (SET_ERROR(EEXIST)); - (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); - err = zap_lookup(os, obj, buf, 8, 1, &used); - ASSERT(err == 0 || err == ENOENT); - /* no underflow/overflow */ - ASSERT(delta > 0 || used >= -delta); - ASSERT(delta < 0 || used + delta > used); - used += delta; - if (used == 0) - err = zap_remove(os, obj, buf, tx); - else - err = zap_update(os, obj, buf, 8, 1, &used, tx); - ASSERT(err == 0); -} + if (bonustype == DMU_OT_ZNODE) { + znode_phys_t *znp = data; + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + } else { + int hdrsize; + sa_hdr_phys_t *sap = data; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; -static int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus, - uint64_t *userp, uint64_t *groupp) -{ - znode_phys_t *znp = bonus; + ASSERT(bonustype == DMU_OT_SA); - if (bonustype != DMU_OT_ZNODE) - return (ENOENT); + if (sa.sa_magic == 0) { + /* + * This should only happen for newly created + * files that haven't had the znode data filled + * in yet. 
+ */ + *userp = 0; + *groupp = 0; + return (0); + } + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } else { + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + } - *userp = znp->zp_uid; - *groupp = znp->zp_gid; + hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_GID_OFFSET)); + if (swap) { + *userp = BSWAP_64(*userp); + *groupp = BSWAP_64(*groupp); + } + } return (0); } @@ -740,7 +807,7 @@ zfs_userspace_many(zfsvfs_t *zfsvfs, zfs uint64_t obj; if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) { @@ -776,7 +843,7 @@ zfs_userspace_many(zfsvfs_t *zfsvfs, zfs */ static int id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, - char *buf, size_t buflen, boolean_t addok) + char *buf, boolean_t addok) { uint64_t fuid; int domainid = 0; @@ -784,10 +851,10 @@ id_to_fuidstr(zfsvfs_t *zfsvfs, const ch if (domain && domain[0]) { domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); if (domainid == -1) - return (ENOENT); + return (SET_ERROR(ENOENT)); } fuid = FUID_ENCODE(domainid, rid); - (void) snprintf(buf, buflen, "%llx", (longlong_t)fuid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); return (0); } @@ -802,13 +869,13 @@ zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_ *valp = 0; if (!dmu_objset_userspace_present(zfsvfs->z_os)) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); obj = zfs_userquota_prop_to_obj(zfsvfs, type); if (obj == 0) return (0); - err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), FALSE); + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); if (err) return (err); @@ -829,15 +896,15 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_ boolean_t fuid_dirtied; if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) - return (EINVAL); + return (SET_ERROR(EINVAL)); if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : &zfsvfs->z_groupquota_obj; - err = id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof(buf), B_TRUE); + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); if (err) return (err); fuid_dirtied = zfsvfs->z_fuid_dirty; @@ -880,7 +947,7 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_ } boolean_t -zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) { char buf[32]; uint64_t used, quota, usedobj, quotaobj; @@ -892,7 +959,7 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs if (quotaobj == 0 || zfsvfs->z_replay) return (B_FALSE); - (void) snprintf(buf, sizeof(buf), "%llx", (longlong_t)fuid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); if (err != 0) return (B_FALSE); @@ -903,60 +970,62 @@ zfs_usergroup_overquota(zfsvfs_t *zfsvfs return (used >= quota); } -int -zfsvfs_create(const char *osname, zfsvfs_t **zfvp) +boolean_t +zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup) { - objset_t *os; - zfsvfs_t *zfsvfs; - uint64_t zval; - int i, error; + uint64_t fuid; + uint64_t quotaobj; - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + quotaobj = isgroup ? 
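A note on id_to_fuidstr() above: the quota ZAP is keyed by the hex form of a FUID. In the upstream sources FUID_ENCODE() packs the domain table index into the upper 32 bits and the RID into the lower 32; treat that layout as an assumption in this standalone sketch of the key construction.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Assumed layout: domain index in the high 32 bits, RID in the low 32. */
static uint64_t
fuid_encode(uint32_t domainid, uint32_t rid)
{
	return ((uint64_t)domainid << 32) | rid;
}

int
main(void)
{
	char buf[32];
	uint64_t fuid = fuid_encode(0, 1000);	/* local domain, uid 1000 */

	/* The ZAP key is simply the hex string, e.g. "3e8" for uid 1000. */
	snprintf(buf, sizeof(buf), "%jx", (uintmax_t)fuid);
	printf("quota ZAP key: %s\n", buf);
	return 0;
}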
zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; - /* - * We claim to always be readonly so we can open snapshots; - * other ZPL code will prevent us from writing to snapshots. - */ - error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); - if (error) { - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - return (error); - } + fuid = isgroup ? zp->z_gid : zp->z_uid; - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. - */ - zfsvfs->z_vfs = NULL; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + return (zfs_fuid_overquota(zfsvfs, isgroup, fuid)); +} + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error) { - goto out; - } else if (zfsvfs->z_version > ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %llu on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); - error = ENOTSUP; - goto out; + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; - if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) - goto out; - zfsvfs->z_norm = (int)zval; - - if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) - goto out; - zfsvfs->z_utf8 = (zval != 0); + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); - if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) - goto out; - zfsvfs->z_case = (uint_t)zval; + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; /* * Fold case on file systems that are always or sometimes case @@ -967,58 +1036,138 @@ zfsvfs_create(const char *osname, zfsvfs zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); - if (error) - goto out; + if (error != 0) + return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); - 
if (error) - goto out; + if (error != 0) + return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); - if (error && error != ENOENT) - goto out; + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); - if (error && error != ENOENT) - goto out; + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (error && error != ENOENT) - goto out; + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); - if (error && error != ENOENT) - goto out; + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). + */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + + return (0); +} + +int +zfsvfs_create(const char *osname, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + + /* + * XXX: Fix struct statfs so this isn't necessary! + * + * The 'osname' is used as the filesystem's special node, which means + * it must fit in statfs.f_mntfromname, or else it can't be + * enumerated, so libzfs_mnttab_find() returns NULL, which causes + * 'zfs unmount' to think it's not mounted when it is. + */ + if (strlen(osname) >= MNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. + */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + if (error) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); +#ifdef DIAGNOSTIC + rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); +#else + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); +#endif rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, zfsvfs); + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + *zfvp = zfsvfs; return (0); - -out: - dmu_objset_disown(os, zfsvfs); - *zfvp = NULL; - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - return (error); } static int @@ -1030,18 +1179,7 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t if (error) return (error); - /* - * Set the objset user_ptr to track its zfsvfs. 
- */ - mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); - dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - if (zil_disable) { - zil_destroy(zfsvfs->z_log, B_FALSE); - zfsvfs->z_log = NULL; - } /* * If we are not mounting (ie: online recv), then we don't @@ -1061,49 +1199,62 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t else zfs_unlinked_drain(zfsvfs); - if (zfsvfs->z_log) { - /* - * Parse and replay the intent log. - * - * Because of ziltest, this must be done after - * zfs_unlinked_drain(). (Further note: ziltest - * doesn't use readonly mounts, where - * zfs_unlinked_drain() isn't called.) This is because - * ziltest causes spa_sync() to think it's committed, - * but actually it is not, so the intent log contains - * many txg's worth of changes. - * - * In particular, if object N is in the unlinked set in - * the last txg to actually sync, then it could be - * actually freed in a later txg and then reallocated - * in a yet later txg. This would write a "create - * object N" record to the intent log. Normally, this - * would be fine because the spa_sync() would have - * written out the fact that object N is free, before - * we could write the "create object N" intent log - * record. - * - * But when we are in ziltest mode, we advance the "open - * txg" without actually spa_sync()-ing the changes to - * disk. So we would see that object N is still - * allocated and in the unlinked set, and there is an - * intent log record saying to allocate it. - */ - zfsvfs->z_replay = B_TRUE; - zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); - zfsvfs->z_replay = B_FALSE; + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } + /* + * Set the objset user_ptr to track its zfsvfs. 
+ */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + return (0); } +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; - extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ /* * This is a barrier to prevent the filesystem from going away in @@ -1115,10 +1266,11 @@ zfsvfs_free(zfsvfs_t *zfsvfs) rw_exit(&zfsvfs_lock); zfs_fuid_destroy(zfsvfs); + mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); - rrw_destroy(&zfsvfs->z_teardown_lock); + rrm_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) @@ -1130,23 +1282,33 @@ static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { - vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); - vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { - dev_t mount_dev; uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; + vnode_t *vp; ASSERT(vfsp); ASSERT(osname); @@ -1155,26 +1317,37 @@ zfs_domount(vfs_t *vfsp, char *osname) if (error) return (error); zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; +#ifdef illumos /* Initialize the generic filesystem structure. 
*/ + vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; if (zfs_create_unique_device(&mount_dev) == -1) { - error = ENODEV; + error = SET_ERROR(ENODEV); goto out; } ASSERT(vfs_devismounted(mount_dev) == 0); +#endif if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL)) - goto out; + goto out; + zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; - vfsp->vfs_bsize = DEV_BSIZE; - vfsp->vfs_flag |= VFS_NOTRUNC; vfsp->vfs_data = zfsvfs; +#ifdef __FreeBSD_kernel__ + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; + vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; + vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ +#endif +#ifdef __NetBSD__ + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_iflag |= IMNT_MPSAFE; +#endif /* * The fsid is 64 bits, composed of an 8-bit fs type, which @@ -1186,11 +1359,16 @@ zfs_domount(vfs_t *vfsp, char *osname) */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); +#ifdef __FreeBSD_kernel__ + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + vfsp->mnt_vfc->vfc_typenum & 0xFF; +#endif +#ifdef __NetBSD__ vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid; - vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) | - zfsfstype & 0xFF; - - dprintf("zfs_domount vrele after vfsp->vfs_count %d\n", vfsp->vfs_count); + vfsp->mnt_stat.f_fsidx.__fsid_val[1] = fsid_guid >> 32; +#endif + /* * Set features for file system. */ @@ -1214,6 +1392,7 @@ zfs_domount(vfs_t *vfsp, char *osname) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); @@ -1222,9 +1401,13 @@ zfs_domount(vfs_t *vfsp, char *osname) error = zfsvfs_setup(zfsvfs, B_TRUE); } - dprintf("zfs_vfsops.c zfs_domount called\n"); - dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); - +#ifdef __FreeBSD_kernel__ + vfs_mountedfrom(vfsp, osname); +#endif +#ifdef __NetBSD__ + set_statvfs_info("on-name", UIO_SYSSPACE, osname, UIO_SYSSPACE, "zfs", vfsp, curlwp); +#endif + if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: @@ -1232,8 +1415,9 @@ out: dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } else { - atomic_add_32(&zfs_active_fs_count, 1); + atomic_inc_32(&zfs_active_fs_count); } + return (error); } @@ -1241,48 +1425,12 @@ void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; - struct dsl_dataset *ds; - - /* - * Unregister properties. 
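The fsid hunk above splits a 56-bit objset GUID across the two 32-bit fsid words; on FreeBSD the low byte of the second word carries the vfs type number, while NetBSD now just uses the high GUID bits directly. A user-space sketch of the FreeBSD-style packing, with sample values in place of the real GUID and type number:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <assert.h>

int
main(void)
{
	uint64_t fsid_guid = 0x00abcdef12345678ULL;	/* sample objset GUID */
	uint32_t vfc_typenum = 0x2a;			/* sample vfs type number */

	/* The GUID is expected to fit in 56 bits. */
	assert((fsid_guid & ~((1ULL << 56) - 1)) == 0);

	uint32_t val0 = (uint32_t)fsid_guid;
	/* Second word: high GUID bits shifted up, vfs type number in the low byte. */
	uint32_t val1 = (uint32_t)(((fsid_guid >> 32) << 8) | (vfc_typenum & 0xFF));

	printf("fsid = %08" PRIx32 ":%08" PRIx32 "\n", val0, val1);
	return 0;
}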
- */ - if (!dmu_objset_is_snapshot(os)) { - ds = dmu_objset_ds(os); - VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, - zfsvfs) == 0); - VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclinherit", - acl_inherit_changed_cb, zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "vscan", - vscan_changed_cb, zfsvfs) == 0); - } + if (!dmu_objset_is_snapshot(os)) + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } +#ifdef SECLABEL /* * Convert a decimal digit string to a uint64_t integer. */ @@ -1293,7 +1441,7 @@ str_to_uint64(char *str, uint64_t *objnu while (*str) { if (*str < '0' || *str > '9') - return (EINVAL); + return (SET_ERROR(EINVAL)); num = num*10 + *str++ - '0'; } @@ -1315,7 +1463,7 @@ zfs_parse_bootfs(char *bpath, char *outp int error; if (*bpath == 0 || *bpath == '/') - return (EINVAL); + return (SET_ERROR(EINVAL)); (void) strcpy(outpath, bpath); @@ -1338,20 +1486,17 @@ zfs_parse_bootfs(char *bpath, char *outp return (error); } - /* - * zfs_check_global_label: - * Check that the hex label string is appropriate for the dataset - * being mounted into the global_zone proper. + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. * - * Return an error if the hex label string is not default or - * admin_low/admin_high. For admin_low labels, the corresponding - * dataset must be readonly. + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { -#ifdef PORT_SOLARIS if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) @@ -1362,30 +1507,23 @@ zfs_check_global_label(const char *dsnam if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) - return (EACCES); + return (SET_ERROR(EACCES)); return (rdonly ? 0 : EACCES); } - return (EACCES); -#else - return 0; -#endif + return (SET_ERROR(EACCES)); } /* - * zfs_mount_label_policy: - * Determine whether the mount is allowed according to MAC check. - * by comparing (where appropriate) label of the dataset against - * the label of the zone being mounted into. If the dataset has - * no label, create one. + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. * - * Returns: - * 0 : access allowed - * >0 : error code, such as EACCES + * Returns 0 if access allowed, error otherwise (e.g. 
EACCES) */ static int zfs_mount_label_policy(vfs_t *vfsp, char *osname) { -#ifdef PORT_SOLARIS int error, retv; zone_t *mntzone = NULL; ts_label_t *mnt_tsl; @@ -1401,7 +1539,7 @@ zfs_mount_label_policy(vfs_t *vfsp, char error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error) - return (EACCES); + return (SET_ERROR(EACCES)); /* * If labeling is NOT enabled, then disallow the mount of datasets @@ -1411,7 +1549,7 @@ zfs_mount_label_policy(vfs_t *vfsp, char if (!is_system_labeled()) { if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); - return (EACCES); + return (SET_ERROR(EACCES)); } /* @@ -1428,7 +1566,7 @@ zfs_mount_label_policy(vfs_t *vfsp, char if (dsl_prop_get_integer(osname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) - return (EACCES); + return (SET_ERROR(EACCES)); if (!zoned) return (zfs_check_global_label(osname, ds_hexsl)); else @@ -1452,8 +1590,9 @@ zfs_mount_label_policy(vfs_t *vfsp, char char *str = NULL; if (l_to_str_internal(mnt_sl, &str) == 0 && - dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), - ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) + dsl_prop_set_string(osname, + zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, str) == 0) retv = 0; if (str != NULL) kmem_free(str, strlen(str) + 1); @@ -1475,12 +1614,10 @@ zfs_mount_label_policy(vfs_t *vfsp, char label_rele(mnt_tsl); zone_rele(mntzone); return (retv); -#else /* PORT_SOLARIS */ - return (0); -#endif } +#endif /* SECLABEL */ -#ifndef __NetBSD__ +#ifdef OPENSOLARIS_MOUNTROOT static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { @@ -1501,7 +1638,7 @@ zfs_mountroot(vfs_t *vfsp, enum whymount */ if (why == ROOT_INIT) { if (zfsrootdone++) - return (EBUSY); + return (SET_ERROR(EBUSY)); /* * the process of doing a spa_load will require the * clock to be set before we could (for example) do @@ -1513,7 +1650,7 @@ zfs_mountroot(vfs_t *vfsp, enum whymount if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) { cmn_err(CE_NOTE, "spa_get_bootfs: can not get " "bootfs name"); - return (EINVAL); + return (SET_ERROR(EINVAL)); } zfs_devid = spa_get_bootprop("diskdevid"); error = spa_import_rootpool(rootfs.bo_name, zfs_devid); @@ -1582,39 +1719,60 @@ out: * if "why" is equal to anything else other than ROOT_INIT, * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. 
*/ - return (ENOTSUP); + return (SET_ERROR(ENOTSUP)); +} +#endif /* OPENSOLARIS_MOUNTROOT */ + +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(poolname, osname); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); } -#endif /*__NetBSD__ */ /*ARGSUSED*/ +#ifdef illumos +static int +zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +#endif +#ifdef __FreeBSD_kernel__ +static int +zfs_mount(vfs_t *vfsp) +#endif +#ifdef __NetBSD__ static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len) +#endif { + vnode_t *mvp = vfsp->mnt_vnodecovered; char *osname; - pathname_t spn; - vnode_t *mvp = vfsp->mnt_vnodecovered; - struct mounta *uap = data; int error = 0; int canwrite; - cred_t *cr; - crget(cr); - dprintf("zfs_vfsops.c zfs_mount called\n"); - dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); +#ifdef illumos if (mvp->v_type != VDIR) - return (ENOTDIR); - - if (uap == NULL) - return (EINVAL); + return (SET_ERROR(ENOTDIR)); - mutex_enter(mvp->v_interlock); + mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { - mutex_exit(mvp->v_interlock); - return (EBUSY); + mutex_exit(&mvp->v_lock); + return (SET_ERROR(EBUSY)); } - mutex_exit(mvp->v_interlock); + mutex_exit(&mvp->v_lock); /* * ZFS does not support passing unparsed data in via MS_DATA. @@ -1623,11 +1781,52 @@ zfs_mount(vfs_t *vfsp, const char *path, * can be interrogated. */ if ((uap->flags & MS_DATA) && uap->datalen > 0) - return (EINVAL); + return (SET_ERROR(EINVAL)); +#endif /* illumos */ - osname = PNBUF_GET(); +#ifdef __FreeBSD_kernel__ + kthread_t *td = curthread; + cred_t *cr = td->td_ucred; + + if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS)) + return (SET_ERROR(EPERM)); + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (SET_ERROR(EINVAL)); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. + */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } + +#endif /* __FreeBSD_kernel__ */ + +#ifdef __NetBSD__ + cred_t *cr = CRED(); + struct mounta *uap = data; + + if (uap == NULL) + return (SET_ERROR(EINVAL)); + + if (mvp->v_type != VDIR) + return (SET_ERROR(ENOTDIR)); + + mutex_enter(mvp->v_interlock); + if ((uap->flags & MS_REMOUNT) == 0 && + (uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(mvp->v_interlock); + return (SET_ERROR(EBUSY)); + } + mutex_exit(mvp->v_interlock); + + osname = PNBUF_GET(); strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1); +#endif /* __NetBSD__ */ /* * Check for mount privilege? 
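The new getpoolname() above simply takes everything before the first '/' of the dataset name, bounded by MAXNAMELEN, so the root-mount path can import the pool by name. A standalone version for illustration; POOLNAME_MAX here is only a stand-in for MAXNAMELEN.

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define POOLNAME_MAX 256	/* stands in for MAXNAMELEN */

/* Copy the pool component of "pool/dataset" into poolname. */
static int
getpoolname(const char *osname, char *poolname)
{
	const char *p = strchr(osname, '/');

	if (p == NULL) {
		if (strlen(osname) >= POOLNAME_MAX)
			return (ENAMETOOLONG);
		strcpy(poolname, osname);
	} else {
		if (p - osname >= POOLNAME_MAX)
			return (ENAMETOOLONG);
		memcpy(poolname, osname, p - osname);
		poolname[p - osname] = '\0';
	}
	return (0);
}

int
main(void)
{
	char pool[POOLNAME_MAX];

	if (getpoolname("tank/home/user", pool) == 0)
		printf("pool: %s\n", pool);	/* prints "tank" */
	return 0;
}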
@@ -1637,8 +1836,10 @@ zfs_mount(vfs_t *vfsp, const char *path, */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { - error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr); - if (error == 0) { + if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) + goto out; + + if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* @@ -1648,52 +1849,111 @@ zfs_mount(vfs_t *vfsp, const char *path, vattr.va_mask = AT_UID; - if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { +#ifdef __FreeBSD_kernel__ + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, cr)) { + VOP_UNLOCK(mvp, 0); goto out; } - if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 && - VOP_ACCESS(mvp, VWRITE, cr) != 0) { - error = EPERM; + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + VOP_UNLOCK(mvp, 0); + goto out; + } + VOP_UNLOCK(mvp, 0); +#endif +#ifdef __NetBSD__ + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) { + VOP_UNLOCK(mvp, 0); goto out; } -/* XXX NetBSD secpolicy_fs_mount_clearopts(cr, vfsp);*/ - } else { - goto out; + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr) != 0) { + VOP_UNLOCK(mvp, 0); + goto out; + } + VOP_UNLOCK(mvp, 0); +#endif } + + secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ - if (!INGLOBALZONE(curproc) && + if (!INGLOBALZONE(curthread) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = EPERM; + error = SET_ERROR(EPERM); goto out; } +#ifdef SECLABEL error = zfs_mount_label_policy(vfsp, osname); if (error) goto out; +#endif + +#ifdef __FreeBSD_kernel__ + vfsp->vfs_flag |= MNT_NFS4ACLS; +#endif +#ifdef __NetBSD__ + vfsp->mnt_iflag |= IMNT_MPSAFE; +#endif /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ - if (uap->flags & MS_REMOUNT) { - /* refresh mount options */ - zfs_unregister_callbacks(vfsp->vfs_data); + if (vfsp->vfs_flag & MS_REMOUNT) { + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * Refresh mount options with z_teardown_lock blocking I/O while + * the filesystem is in an inconsistent state. + * The lock also serializes this code with filesystem + * manipulations between entry to zfs_suspend_fs() and return + * from zfs_resume_fs(). + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); goto out; } - /* Mark ZFS as MP SAFE */ - vfsp->mnt_iflag |= IMNT_MPSAFE; - +#ifdef __FreeBSD_kernel__ + /* Initial root mount: try hard to import the requested root pool. */ + if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && + (vfsp->vfs_flag & MNT_UPDATE) == 0) { + char pname[MAXNAMELEN]; + + error = getpoolname(osname, pname); + if (error == 0) + error = spa_import_rootpool(pname); + if (error) + goto out; + } +#endif + + DROP_GIANT(); error = zfs_domount(vfsp, osname); + PICKUP_GIANT(); + +#ifdef illumos + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. 
+ */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); +#endif +#ifdef __NetBSD__ vfs_getnewfsid(vfsp); /* setup zfs mount info */ @@ -1701,26 +1961,28 @@ zfs_mount(vfs_t *vfsp, const char *path, sizeof(vfsp->mnt_stat.f_mntfromname)); set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname, UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp); - - /* - * Add an extra VFS_HOLD on our parent vfs so that it can't - * disappear due to a forced unmount. - */ - if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) - VFS_HOLD(mvp->v_vfsp); +#endif out: - PNBUF_PUT(osname); return (error); } +#ifdef __FreeBSD_kernel__ +static int +zfs_statfs(vfs_t *vfsp, struct statfs *statp) +#endif +#ifdef __NetBSD__ static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp) +#endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; - dev_t dev; uint64_t refdbytes, availbytes, usedobjs, availobjs; +#ifdef __FreeBSD_kernel__ + statp->f_version = STATFS_VERSION; +#endif + ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, @@ -1731,8 +1993,11 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ - statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT; - statp->f_bsize = zfsvfs->z_max_blksz; + statp->f_bsize = SPA_MINBLOCKSIZE; +#ifdef __NetBSD__ + statp->f_frsize = SPA_MINBLOCKSIZE; +#endif + statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the @@ -1741,7 +2006,7 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; - statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT; + statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* @@ -1753,51 +2018,62 @@ zfs_statvfs(vfs_t *vfsp, struct statvfs * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); +#ifndef __FreeBSD__ statp->f_favail = statp->f_ffree; /* no "root reservation" */ +#endif statp->f_files = statp->f_ffree + usedobjs; +#ifdef __FreeBSD__ + (void) cmpldev(&d32, vfsp->vfs_dev); + statp->f_fsid = d32; +#endif +#ifdef __NetBSD__ statp->f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0]; - +#endif + /* * We're a zfs filesystem. */ (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename)); - (void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, + + strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof(statp->f_mntfromname)); - (void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, + strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof(statp->f_mntonname)); +#ifdef __FreeBSD_kernel__ + statp->f_namemax = MAXNAMELEN - 1; +#endif +#ifdef __NetBSD__ statp->f_namemax = ZFS_MAXNAMELEN; - - /* - * We have all of 32 characters to stuff a string here. - * Is there anything useful we could/should provide? 
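The statfs/statvfs hunk above reports space in SPA_MINBLOCKSIZE (512-byte) units and derives the inode numbers from object counts, since ZFS has no fixed inode table. A quick user-space rendering of that arithmetic with made-up pool numbers standing in for dmu_objset_space() output:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define SPA_MINBLOCKSHIFT 9
#define SPA_MINBLOCKSIZE  (1ULL << SPA_MINBLOCKSHIFT)	/* 512 */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
	/* Sample values in place of dmu_objset_space() output. */
	uint64_t refdbytes  = 10ULL << 30;	/* 10 GiB referenced */
	uint64_t availbytes = 40ULL << 30;	/* 40 GiB available */
	uint64_t usedobjs   = 123456;
	uint64_t availobjs  = 1ULL << 20;

	uint64_t f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
	uint64_t f_bfree  = availbytes / SPA_MINBLOCKSIZE;
	uint64_t f_bavail = f_bfree;			/* no root reservation */
	uint64_t f_ffree  = MIN(availobjs, f_bfree);	/* each object needs a block */
	uint64_t f_files  = f_ffree + usedobjs;

	printf("blocks=%" PRIu64 " bfree=%" PRIu64 " bavail=%" PRIu64 "\n",
	    f_blocks, f_bfree, f_bavail);
	printf("files=%" PRIu64 " ffree=%" PRIu64 "\n", f_files, f_ffree);
	return 0;
}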
- */ -#ifndef __NetBSD__ - bzero(statp->f_fstr, sizeof (statp->f_fstr)); #endif + ZFS_EXIT(zfsvfs); return (0); } static int -zfs_root(vfs_t *vfsp, vnode_t **vpp) +zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); - dprintf("zfs_root called\n"); + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); - dprintf("vpp -> %d, error %d -- %p\n", (*vpp)->v_type, error, *vpp); + ZFS_EXIT(zfsvfs); - if (error == 0) - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - KASSERT((error != 0) || (*vpp != NULL)); - KASSERT((error != 0) || (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)); + + if (error == 0) { + error = vn_lock(*vpp, flags); + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + } + } return (error); } @@ -1812,7 +2088,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea { znode_t *zp; - rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); if (!unmounting) { /* @@ -1822,6 +2098,9 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea * 'z_parent' is self referential for non-snapshots. */ (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0); +#ifdef FREEBSD_NAMECACHE + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); +#endif } /* @@ -1842,8 +2121,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); - return (EIO); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); } /* @@ -1856,8 +2135,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) - if (zp->z_dbuf) { - ASSERT(ZTOV(zp)->v_count > 0); + if (zp->z_sa_hdl) { + ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); @@ -1869,7 +2148,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); rw_exit(&zfsvfs->z_teardown_inactive_lock); } @@ -1889,10 +2168,10 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolea /* * Evict cached data */ - if (dmu_objset_evict_dbufs(zfsvfs->z_os)) { + if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) && + !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(zfsvfs->z_os); - } + dmu_objset_evict_dbufs(zfsvfs->z_os); return (0); } @@ -1903,32 +2182,22 @@ zfs_umount(vfs_t *vfsp, int fflag) { zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; - int ret, flags = 0; - cred_t *cr; - - vnode_t *vpp; - int counter; - - counter = 0; - - dprintf("ZFS_UMOUNT called\n"); + int ret; +#ifdef __FreeBSD_kernel__ + kthread_t *td = curthread; + cred_t *cr = td->td_ucred; +#endif +#ifdef __NetBSD__ + cred_t *cr = CRED(); +#endif - /*TAILQ_FOREACH(vpp, &vfsp->mnt_vnodelist, v_mntvnodes) { - printf("vnode list vnode number %d -- vnode address %p\n", counter, vpp); - vprint("ZFS vfsp vnode list", vpp); - counter++; - } */ - - crget(cr); -#ifdef TODO ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { - ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), - ZFS_DELEG_PERM_MOUNT, cr); - if (ret) + if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource), + ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } -#endif + /* * We purge the parent filesystem's vfsp as the parent 
filesystem * and all of its snapshots have their vnode's v_vfsp set to the @@ -1941,12 +2210,35 @@ zfs_umount(vfs_t *vfsp, int fflag) * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ - if (zfsvfs->z_ctldir != NULL && - (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) { - return (ret); + if (zfsvfs->z_ctldir != NULL) { + if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); } -#if 0 + if (fflag & MS_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * Flush all the files. + */ +#ifdef __FreeBSD_kernel__ + ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); +#endif +#ifdef __NetBSD__ + ret = vflush(vfsp, NULL, (fflag & MS_FORCE) ? FORCECLOSE : 0); +#endif + if (ret != 0) + return (ret); + +#ifdef illumos if (!(fflag & MS_FORCE)) { /* * Check the number of active vnodes in the file system. @@ -1959,21 +2251,15 @@ zfs_umount(vfs_t *vfsp, int fflag) * reflected in the vnode count. */ if (zfsvfs->z_ctldir == NULL) { - if (vfsp->vfs_count > 1){ - return (EBUSY); - } + if (vfsp->vfs_count > 1) + return (SET_ERROR(EBUSY)); } else { if (vfsp->vfs_count > 2 || - zfsvfs->z_ctldir->v_count > 1) { - return (EBUSY); - } + zfsvfs->z_ctldir->v_count > 1) + return (SET_ERROR(EBUSY)); } } #endif - ret = vflush(vfsp, NULL, (ISSET(fflag, MS_FORCE)? FORCECLOSE : 0)); - if (ret != 0) - return ret; - vfsp->vfs_flag |= VFS_UNMOUNTED; VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; @@ -2001,59 +2287,111 @@ zfs_umount(vfs_t *vfsp, int fflag) */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + zfs_freevfs(vfsp); return (0); } +#ifdef __FreeBSD_kernel__ +static int +zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) +#endif +#ifdef __NetBSD__ static int zfs_vget(vfs_t *vfsp, ino_t ino, vnode_t **vpp) +#endif { - zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - int err; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + int err; +#ifdef __NetBSD__ + int flags = LK_EXCLUSIVE; +#endif + + /* + * zfs_zget() can't operate on virtual entries like .zfs/ or + * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. + * This will make NFS to switch to LOOKUP instead of using VGET. + */ + if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || + (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) + return (EOPNOTSUPP); - dprintf("zfs_vget called\n"); - dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); - ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { VN_RELE(ZTOV(zp)); err = EINVAL; } - if (err != 0) - *vpp = NULL; - else { + if (err == 0) *vpp = ZTOV(zp); - /* XXX NetBSD how to get flags for vn_lock ? 
*/ - vn_lock(*vpp, 0); - } ZFS_EXIT(zfsvfs); + if (err == 0) + err = vn_lock(*vpp, flags); + if (err != 0) + *vpp = NULL; + return (err); } +#ifdef __FreeBSD_kernel__ static int -zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) +zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) { zfsvfs_t *zfsvfs = vfsp->vfs_data; - znode_t *zp; - uint64_t object = 0; - uint64_t fid_gen = 0; - uint64_t gen_mask; - uint64_t zp_gen; - int i, err; + + /* + * If this is regular file system vfsp is the same as + * zfsvfs->z_parent->z_vfs, but if it is snapshot, + * zfsvfs->z_parent->z_vfs represents parent file system + * which we have to use here, because only this file system + * has mnt_export configured. + */ + return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, + credanonp, numsecflavors, secflavors)); +} + +CTASSERT(SHORT_FID_LEN <= sizeof(struct fid)); +CTASSERT(LONG_FID_LEN <= sizeof(struct fid)); +#endif + +#ifdef __FreeBSD_kernel__ +static int +zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) +#endif +#ifdef __NetBSD__ +static int +zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp) +#endif +{ + struct componentname cn; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + vnode_t *dvp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; *vpp = NULL; - dprintf("zfs_fhtovp called\n"); - dprintf("vfsp->vfs_count %d\n", vfsp->vfs_count); - +#ifdef __NetBSD__ + return (SET_ERROR(ENOTSUP)); +#endif + ZFS_ENTER(zfsvfs); - if (fidp->fid_len == LONG_FID_LEN) { - zfid_long_t *zlfid = (zfid_long_t *)fidp; - uint64_t objsetid = 0; - uint64_t setgen = 0; +#ifdef __FreeBSD_kernel__ + /* + * On FreeBSD we can get snapshot's mount point or its parent file + * system mount point depending if snapshot is already mounted or not. + */ + if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); @@ -2065,12 +2403,12 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vno err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) - return (EINVAL); + return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { - zfid_short_t *zfid = (zfid_short_t *)fidp; + zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); @@ -2079,26 +2417,46 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vno fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } - /* A zero fid_gen means we are in the .zfs control directories */ - if (fid_gen == 0 && - (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { - *vpp = zfsvfs->z_ctldir; - ASSERT(*vpp != NULL); + /* + * A zero fid_gen means we are in .zfs or the .zfs/snapshot + * directory tree. If the object == zfsvfs->z_shares_dir, then + * we are in the .zfs/shares directory tree. 
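The zfs_fhtovp() hunks above rebuild the object number and generation from little-endian byte arrays stored in the NFS file handle. A small standalone sketch of that pack/unpack, assuming the usual zfid_short_t layout of a 6-byte object and 4-byte generation; the concrete values are invented.

/*
 * Illustration of the byte-array encoding used for ZFS file handles:
 * pack a 64-bit object number and generation into little-endian byte
 * arrays, then rebuild them the way zfs_fhtovp() does.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint8_t zf_object[6], zf_gen[4];
	uint64_t object = 0x123456789aULL;
	uint64_t gen = 0x0badcafeULL;
	uint64_t obj2 = 0, gen2 = 0;
	int i;

	/* pack, as the fid-producing side does */
	for (i = 0; i < (int)sizeof (zf_object); i++)
		zf_object[i] = (object >> (8 * i)) & 0xff;
	for (i = 0; i < (int)sizeof (zf_gen); i++)
		zf_gen[i] = (gen >> (8 * i)) & 0xff;

	/* unpack, as zfs_fhtovp() does */
	for (i = 0; i < (int)sizeof (zf_object); i++)
		obj2 |= ((uint64_t)zf_object[i]) << (8 * i);
	for (i = 0; i < (int)sizeof (zf_gen); i++)
		gen2 |= ((uint64_t)zf_gen[i]) << (8 * i);

	/* keep only the generation bits that actually fit in the handle */
	uint64_t gen_mask = -1ULL >> (64 - 8 * i);

	printf("object %#llx gen %#llx mask %#llx\n",
	    (unsigned long long)obj2, (unsigned long long)gen2,
	    (unsigned long long)gen_mask);
	return 0;
}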
+ */ + if ((fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || + (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { + ZFS_EXIT(zfsvfs); + VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { - VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL, - 0, NULL, NULL, NULL, NULL, NULL) == 0); + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | LOCKLEAF; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else if (object == zfsvfs->z_shares_dir) { + /* + * XXX This branch must not be taken, + * if it is, then the lookup below will + * explode. + */ + cn.cn_nameptr = "shares"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); } else { - VN_HOLD(*vpp); + *vpp = dvp; } - ZFS_EXIT(zfsvfs); - /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); - return (0); + return (err); } - +#endif gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); @@ -2106,28 +2464,37 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vno ZFS_EXIT(zfsvfs); return (err); } - zp_gen = zp->z_phys->zp_gen & gen_mask; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); VN_RELE(ZTOV(zp)); ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); - /* XXX: LK_RETRY? */ - vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); ZFS_EXIT(zfsvfs); - return (0); +#ifdef __FreeBSD_kernel__ + err = vn_lock(*vpp, flags); + if (err == 0) + vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; +#endif + return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and - * 'z_teardown_inactive_lock' write held. + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) @@ -2136,57 +2503,70 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs) if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - dmu_objset_disown(zfsvfs->z_os, zfsvfs); return (0); } /* - * Reopen zfsvfs_t::z_os and release VOPs. + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. */ int -zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; + znode_t *zp; - ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, - &zfsvfs->z_os); - if (err) { - zfsvfs->z_os = NULL; - } else { - znode_t *zp; + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. 
+ */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + VERIFY0(dmu_objset_from_ds(ds, &os)); - VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; - /* - * Attempt to re-establish all the active znodes with - * their dbufs. If a zfs_rezget() fails, then we'll let - * any potential callers discover that via ZFS_ENTER_VERIFY_VP - * when they try to use their znode. - */ - mutex_enter(&zfsvfs->z_znodes_lock); - for (zp = list_head(&zfsvfs->z_all_znodes); zp; - zp = list_next(&zfsvfs->z_all_znodes, zp)) { - (void) zfs_rezget(zp); - } - mutex_exit(&zfsvfs->z_znodes_lock); + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + zfs_set_fuid_feature(zfsvfs); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); } + mutex_exit(&zfsvfs->z_znodes_lock); +bail: /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrw_exit(&zfsvfs->z_teardown_lock, FTAG); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); if (err) { /* - * Since we couldn't reopen zfsvfs::z_os, force + * Since we couldn't setup the sa framework, try to force * unmount this file system. */ - if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) - (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curlwp); + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { + vfs_ref(zfsvfs->z_vfs); + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } } return (err); } @@ -2196,76 +2576,76 @@ zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; +#ifdef illumos /* * If this is a snapshot, we have an extra VFS_HOLD on our parent - * from zfs_mount(). Release it here. + * from zfs_mount(). Release it here. If we came through + * zfs_mountroot() instead, we didn't grab an extra hold, so + * skip the VFS_RELE for rootvfs. */ - if (zfsvfs->z_issnap) + if (zfsvfs->z_issnap && (vfsp != rootvfs)) VFS_RELE(zfsvfs->z_parent->z_vfs); +#endif zfsvfs_free(zfsvfs); - atomic_add_32(&zfs_active_fs_count, -1); + atomic_dec_32(&zfs_active_fs_count); } -/* - * VFS_INIT() initialization. Note that there is no VFS_FINI(), - * so we can't safely do any non-idempotent initialization here. - * Leave that to zfs_init() and zfs_fini(), which are called - * from the module's _init() and _fini() entry points. - */ -/*ARGSUSED*/ -int -zfs_vfsinit(int fstype, char *name) -{ - int error; - - zfsfstype = fstype; +#ifdef __FreeBSD_kernel__ +#ifdef __i386__ +static int desiredvnodes_backup; +#endif - /* - * Setup vfsops and vnodeops tables. - */ - error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops); +static void +zfs_vnodes_adjust(void) +{ +#ifdef __i386__ + int newdesiredvnodes; - error = zfs_create_op_tables(); - if (error) { - zfs_remove_op_tables(); - cmn_err(CE_WARN, "zfs: bad vnode ops template"); - vfs_freevfsops_by_type(zfsfstype); - return (error); - } + desiredvnodes_backup = desiredvnodes; - mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfs_debug_mtx, NULL, MUTEX_DEFAULT, NULL); - /* - * Unique major number for all zfs mounts. - * If we run out of 32-bit minors, we'll getudev() another major. + * We calculate newdesiredvnodes the same way it is done in + * vntblinit(). 
If it is equal to desiredvnodes, it means that + * it wasn't tuned by the administrator and we can tune it down. */ - zfs_major = ddi_name_to_major(ZFS_DRIVER); - zfs_minor = ZFS_MIN_MINOR; - - return (0); + newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof(struct vm_object) + + sizeof(struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; +#endif } -int -zfs_vfsfini(void) +static void +zfs_vnodes_adjust_back(void) { - int err; - err = vfs_detach(&zfs_vfsops_template); - if (err != 0) - return err; +#ifdef __i386__ + desiredvnodes = desiredvnodes_backup; +#endif +} +#endif /* __FreeBSD_kernel__ */ - mutex_destroy(&zfs_debug_mtx); - mutex_destroy(&zfs_dev_mtx); +#ifdef __NetBSD__ +static void +zfs_vnodes_adjust(void) +{ +} - return 0; +static void +zfs_vnodes_adjust_back(void) +{ } +#endif void zfs_init(void) { + + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + /* * Initialize .zfs directory structures */ @@ -2276,6 +2656,13 @@ zfs_init(void) */ zfs_znode_init(); + /* + * Reduce number of vnodes. Originally number of vnodes is calculated + * with UFS inode in mind. We reduce it here, because it's too big for + * ZFS/i386. + */ + zfs_vnodes_adjust(); + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } @@ -2284,6 +2671,7 @@ zfs_fini(void) { zfsctl_fini(); zfs_znode_fini(); + zfs_vnodes_adjust_back(); } int @@ -2300,18 +2688,28 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64 dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) - return (EINVAL); + return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) - return (EINVAL); + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); @@ -2320,17 +2718,30 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64 return (error); } - spa_history_internal_log(LOG_DS_UPGRADE, - dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", - zfsvfs->z_version, newvers, dmu_objset_id(os)); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; - if (zfsvfs->z_version >= ZPL_VERSION_FUID) - zfs_set_fuid_feature(zfsvfs); + zfs_set_fuid_feature(zfsvfs); return (0); } @@ -2377,25 +2788,36 @@ zfs_get_zplprop(objset_t *os, zfs_prop_t return (error); } -static int -zfs_start(vfs_t *vfsp, int flags) +#ifdef __FreeBSD_kernel__ +#ifdef _KERNEL +void +zfsvfs_update_fromname(const char *oldname, const char *newname) { - - return (0); + char tmpbuf[MAXPATHLEN]; + struct mount *mp; + char *fromname; + 
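The zfs_vnodes_adjust() hunk above recomputes desiredvnodes the way the FreeBSD vntblinit() would and, if the administrator has left it untouched, scales it down to 3/4 for ZFS/i386. A sketch of just the arithmetic, with invented stand-ins for maxproc, page count, kmem size, and structure sizes:

/*
 * Sketch of the i386 desiredvnodes tuning: recompute the default, and
 * only if it was not overridden, shrink it to 3/4.  Inputs are invented.
 */
#include <stdio.h>

static long
min_l(long a, long b)
{
	return (a < b ? a : b);
}

int
main(void)
{
	long maxproc = 1044;
	long v_page_count = 250000;              /* pages of RAM */
	long vm_kmem_size = 400L << 20;          /* kernel kmem arena, bytes */
	long sz_vm_object = 128, sz_vnode = 480; /* struct sizes, invented */
	long desiredvnodes, newdesiredvnodes;

	newdesiredvnodes = min_l(maxproc + v_page_count / 4,
	    2 * vm_kmem_size / (5 * (sz_vm_object + sz_vnode)));
	desiredvnodes = newdesiredvnodes;        /* pretend it was untouched */

	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;

	printf("desiredvnodes tuned from %ld to %ld\n",
	    newdesiredvnodes, desiredvnodes);
	return 0;
}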
size_t oldlen; + + oldlen = strlen(oldname); + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + fromname = mp->mnt_stat.f_mntfromname; + if (strcmp(fromname, oldname) == 0) { + (void)strlcpy(fromname, newname, + sizeof(mp->mnt_stat.f_mntfromname)); + continue; + } + if (strncmp(fromname, oldname, oldlen) == 0 && + (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { + (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s", + newname, fromname + oldlen); + (void)strlcpy(fromname, tmpbuf, + sizeof(mp->mnt_stat.f_mntfromname)); + continue; + } + } + mtx_unlock(&mountlist_mtx); } - - -#ifdef TODO -static vfsdef_t vfw = { - VFSDEF_VERSION, - MNTTYPE_ZFS, - zfs_vfsinit, - VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS| - VSW_XID, - &zfs_mntopts -}; - -struct modlfs zfs_modlfs = { - &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw -}; +#endif #endif Index: src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c =================================================================== RCS file: /home/chs/netbsd/cvs/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c,v retrieving revision 1.26 diff -u -p -r1.26 zfs_vnops.c --- src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c 26 May 2017 14:20:59 -0000 1.26 +++ src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c 13 Sep 2017 07:55:33 -0000 @@ -19,11 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ #include #include @@ -32,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -48,28 +52,39 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include #include +#include #include #include -#include +#include #include #include -#include -#include #include #include #include -#include +#include + +#ifdef __FreeBSD__ +#include +#include +#include +#endif #ifdef __NetBSD__ #include +#include +#include + +uint_t zfs_putpage_key; #endif /* @@ -83,11 +98,11 @@ * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * - * (1) A check must be made in each zfs thread for a mounted file system. + * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros - * can return EIO from the calling function. + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: @@ -102,11 +117,18 @@ * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * - * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). - * This is critical because we don't want to block while holding locks. 
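The zfsvfs_update_fromname() hunk above rewrites f_mntfromname on every mount when a dataset is renamed: an exact match is replaced outright, and children ("oldname/child") or snapshots ("oldname@snap") keep their suffix. A userland sketch of that string rewrite, with an invented buffer size and sample names:

/*
 * Sketch of the mount-from-name rewrite done after a dataset rename.
 * Note that "tank/homework" must not match a rename of "tank/home".
 */
#include <stdio.h>
#include <string.h>

#define MNAMELEN 1024

static void
update_fromname(char *fromname, const char *oldname, const char *newname)
{
	char tmp[MNAMELEN];
	size_t oldlen = strlen(oldname);

	if (strcmp(fromname, oldname) == 0) {
		snprintf(fromname, MNAMELEN, "%s", newname);
		return;
	}
	if (strncmp(fromname, oldname, oldlen) == 0 &&
	    (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
		snprintf(tmp, sizeof(tmp), "%s%s", newname, fromname + oldlen);
		snprintf(fromname, MNAMELEN, "%s", tmp);
	}
}

int
main(void)
{
	char a[MNAMELEN] = "tank/home";
	char b[MNAMELEN] = "tank/home/user@snap1";
	char c[MNAMELEN] = "tank/homework";      /* must stay unchanged */

	update_fromname(a, "tank/home", "tank/people");
	update_fromname(b, "tank/home", "tank/people");
	update_fromname(c, "tank/home", "tank/people");
	printf("%s\n%s\n%s\n", a, b, c);
	return 0;
}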
- * Note, in particular, that if a lock is sometimes acquired before - * the tx assigns, and sometimes after (e.g. z_lock), then failing to - * use a non-blocking assign can deadlock the system. The scenario: + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. @@ -114,34 +136,39 @@ * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, - * then drop all locks, call dmu_tx_wait(), and try again. + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. - * During ZIL replay the zfs_log_* functions will update the sequence + * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * - * (7) After dropping all locks, invoke zil_commit(zilog, seq, foid) + * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: - * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD()) + * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign + * error = dmu_tx_assign(tx, waited ? 
TXG_WAITED : TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks - * zfs_dirent_unlock(...); // unlock directory entry + * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { + * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; @@ -155,9 +182,9 @@ * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks - * zfs_dirent_unlock(dl, 0); // unlock directory entry + * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes - * zil_commit(zilog, seq, foid); // synchronous when necessary + * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ @@ -167,23 +194,31 @@ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; - if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) && + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { - return (EPERM); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) - if (fs_vscan(*vpp, cr, 0) != 0) - return (EACCES); + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); + ZFS_EXIT(zfsvfs); return (0); } @@ -204,22 +239,19 @@ zfs_close(vnode_t *vp, int flag, int cou ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - dprintf("zfs_close called \n"); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && - !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && - zp->z_phys->zp_size > 0) + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } -#ifdef PORT_NETBSD /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. @@ -233,9 +265,9 @@ zfs_holey(vnode_t *vp, u_long cmd, offse int error; boolean_t hole; - file_sz = zp->z_phys->zp_size; + file_sz = zp->z_size; if (noff >= file_sz) { - return (ENXIO); + return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) @@ -245,16 +277,19 @@ zfs_holey(vnode_t *vp, u_long cmd, offse error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); - /* end of file? */ - if ((error == ESRCH) || (noff > file_sz)) { - /* - * Handle the virtual hole at the end of file. - */ - if (hole) { - *off = file_sz; - return (0); - } - return (ENXIO); + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. 
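The zfs_holey() comment above (and the clamp that immediately follows it) describes how a hole reported at block granularity is pinned back to the logical EOF when the EOF falls mid-block. A tiny sketch of that clamp, with an invented block size and file length:

/*
 * SEEK_HOLE illustration: the DMU can only report whole blocks, so a
 * hole "found" past EOF is moved back to the logical EOF.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t blksz = 128 * 1024;
	uint64_t file_sz = 300 * 1024;             /* EOF falls mid-block */
	/* next hole as a block-granular offset (here: end of last block) */
	uint64_t noff = ((file_sz + blksz - 1) / blksz) * blksz;

	if (noff > file_sz)
		noff = file_sz;                    /* virtual hole at EOF */

	printf("SEEK_HOLE -> %llu\n", (unsigned long long)noff);
	return 0;
}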
+ */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; } if (noff < *off) @@ -262,54 +297,220 @@ zfs_holey(vnode_t *vp, u_long cmd, offse *off = noff; return (error); } -#endif /* PORT_NETBSD */ +/* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred, int *rvalp, caller_context_t *ct) { offset_t off; + offset_t ndata; + dmu_object_info_t doi; int error; zfsvfs_t *zfsvfs; znode_t *zp; - + switch (com) { case _FIOFFS: + { return (0); - + /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ + } case _FIOGDIO: case _FIOSDIO: + { return (0); -#ifdef PORT_NETBSD /* XXX NetBSD Do we support holes in files ? */ + } + case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: + { +#ifdef illumos if (ddi_copyin((void *)data, &off, sizeof (off), flag)) - return (EFAULT); - + return (SET_ERROR(EFAULT)); +#else + off = *(offset_t *)data; +#endif zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - + /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); +#ifdef illumos if (ddi_copyout(&off, (void *)data, sizeof (off), flag)) - return (EFAULT); + return (SET_ERROR(EFAULT)); +#else + *(offset_t *)data = off; +#endif + return (0); + } +#ifdef illumos + case _FIO_COUNT_FILLED: + { + /* + * _FIO_COUNT_FILLED adds a new ioctl command which + * exposes the number of filled blocks in a + * ZFS object. + */ + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * Wait for all dirty blocks for this object + * to get synced out to disk, and the DMU info + * updated. + */ + error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Retrieve fill count from DMU object. + */ + error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + ndata = doi.doi_fill_count; + + ZFS_EXIT(zfsvfs); + if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag)) + return (SET_ERROR(EFAULT)); return (0); + } #endif } - - return (ENOTTY); + return (SET_ERROR(ENOTTY)); +} + +#ifdef __FreeBSD__ +static vm_page_t +page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) +{ + vm_object_t obj; + vm_page_t pp; + int64_t end; + + /* + * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE + * aligned boundaries, if the range is not aligned. As a result a + * DEV_BSIZE subrange with partially dirty data may get marked as clean. + * It may happen that all DEV_BSIZE subranges are marked clean and thus + * the whole page would be considred clean despite have some dirty data. + * For this reason we should shrink the range to DEV_BSIZE aligned + * boundaries before calling vm_page_clear_dirty. + */ + end = rounddown2(off + nbytes, DEV_BSIZE); + off = roundup2(off, DEV_BSIZE); + nbytes = end - off; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. 
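The DEV_BSIZE comment in page_busy() above explains why the range passed to vm_page_clear_dirty() is shrunk, not extended, to sector boundaries: a partially written sector must stay dirty. A standalone sketch of that rounding, assuming a 512-byte DEV_BSIZE and invented offsets:

/*
 * Shrink a byte range to whole DEV_BSIZE sectors before clearing dirty
 * bits, so partially written sectors are left marked dirty.
 */
#include <stdint.h>
#include <stdio.h>

#define DEV_BSIZE 512
#define rounddown2(x, y) ((x) & ~((y) - 1))      /* y must be a power of 2 */
#define roundup2(x, y)   (((x) + ((y) - 1)) & ~((y) - 1))

int
main(void)
{
	int64_t off = 100;      /* byte offset of the write within the page */
	int64_t nbytes = 3000;  /* length of the write */
	int64_t end;

	end = rounddown2(off + nbytes, DEV_BSIZE);  /* 3100 -> 3072 */
	off = roundup2(off, DEV_BSIZE);             /* 100  -> 512  */
	nbytes = end - off;                         /* only full sectors */

	printf("clear dirty from %lld for %lld bytes\n",
	    (long long)off, (long long)nbytes);
	return 0;
}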
+ */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + vm_page_sbusy(pp); + } else if (pp != NULL) { + ASSERT(!pp->valid); + pp = NULL; + } + + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } + break; + } + return (pp); +} + +static void +page_unbusy(vm_page_t pp) +{ + + vm_page_sunbusy(pp); + vm_object_pip_subtract(pp->object, 1); +} + +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t pp; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. + */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_lock(pp); + vm_page_hold(pp); + vm_page_unlock(pp); + + } else + pp = NULL; + break; + } + return (pp); +} + +static void +page_unhold(vm_page_t pp) +{ + + vm_page_lock(pp); + vm_page_unhold(pp); + vm_page_unlock(pp); } -#ifdef PORT_NETBSD /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: @@ -318,33 +519,109 @@ zfs_ioctl(vnode_t *vp, u_long com, intpt * the page and the dmu buffer. */ static void -update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, + int segflg, dmu_tx_t *tx) { - int64_t off; + vm_object_t obj; + struct sf_buf *sf; + caddr_t va; + int off; + + ASSERT(segflg != UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); off = start & PAGEOFFSET; - dirbytes = 0; - VM_OBJECT_LOCK(obj); + zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { - page_t *pp; - uint64_t nbytes = MIN(PAGESIZE - off, len); + vm_page_t pp; + int nbytes = imin(PAGESIZE - off, len); - if (pp = page_lookup(vp, start, SE_SHARED)) { - caddr_t va; + if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { + zfs_vmobject_wunlock(obj); - va = zfs_map_page(pp, S_WRITE); - (void) dmu_read(os, oid, start+off, nbytes, va+off, - DMU_READ_PREFETCH); - zfs_unmap_page(pp, va); - page_unlock(pp); + va = zfs_map_page(pp, &sf); + (void) dmu_read(os, oid, start+off, nbytes, + va+off, DMU_READ_PREFETCH);; + zfs_unmap_page(sf); + + zfs_vmobject_wlock(obj); + page_unbusy(pp); } len -= nbytes; off = 0; } + vm_object_pip_wakeupn(obj, 0); + zfs_vmobject_wunlock(obj); +} + +/* + * Read with UIO_NOCOPY flag means that sendfile(2) requests + * ZFS to populate a range of page cache pages with data. + * + * NOTE: this function could be optimized to pre-allocate + * all pages in advance, drain exclusive busy on all of them, + * map them into contiguous KVA region and populate them + * in one single dmu_read() call. 
+ */ +static int +mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + struct sf_buf *sf; + vm_object_t obj; + vm_page_t pp; + int64_t start; + caddr_t va; + int len = nbytes; + int off; + int error = 0; - VM_OBJECT_UNLOCK(obj); - if (error == 0 && dirbytes > 0) - error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx); + ASSERT(uio->uio_segflg == UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); + + zfs_vmobject_wlock(obj); + for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { + int bytes = MIN(PAGESIZE, len); + + pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); + if (pp->valid == 0) { + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); + error = dmu_read(os, zp->z_id, start, bytes, va, + DMU_READ_PREFETCH); + if (bytes != PAGESIZE && error == 0) + bzero(va + bytes, PAGESIZE - bytes); + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + vm_page_sunbusy(pp); + vm_page_lock(pp); + if (error) { + if (pp->wire_count == 0 && pp->valid == 0 && + !vm_page_busied(pp)) + vm_page_free(pp); + } else { + pp->valid = VM_PAGE_BITS_ALL; + vm_page_activate(pp); + } + vm_page_unlock(pp); + } else { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_sunbusy(pp); + } + if (error) + break; + uio->uio_resid -= bytes; + uio->uio_offset += bytes; + len -= bytes; + } + zfs_vmobject_wunlock(obj); return (error); } @@ -356,21 +633,18 @@ update_pages(vnode_t *vp, int64_t start, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. + * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); - objset_t *os = zp->z_zfsvfs->z_os; vm_object_t obj; - vm_page_t m; - struct sf_buf *sf; - int64_t start, off; + int64_t start; caddr_t va; int len = nbytes; + int off; int error = 0; - uint64_t dirbytes; ASSERT(vp->v_mount != NULL); obj = vp->v_object; @@ -378,77 +652,161 @@ mappedread(vnode_t *vp, int nbytes, uio_ start = uio->uio_loffset; off = start & PAGEOFFSET; - dirbytes = 0; - VM_OBJECT_LOCK(obj); + zfs_vmobject_wlock(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); -again: - if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && - vm_page_is_valid(m, (vm_offset_t)off, bytes)) { - if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) - goto again; - vm_page_busy(m); - VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_read_uio(os, zp->z_id, uio, - dirbytes); - dirbytes = 0; - } - if (error == 0) { - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - va = (caddr_t)sf_buf_kva(sf); - error = uiomove(va + off, bytes, UIO_READ, uio); - sf_buf_free(sf); - sched_unpin(); - } - VM_OBJECT_LOCK(obj); - vm_page_wakeup(m); - } else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) { - /* - * The code below is here to make sendfile(2) work - * correctly with ZFS. As pointed out by ups@ - * sendfile(2) should be changed to use VOP_GETPAGES(), - * but it pessimize performance of sendfile/UFS, that's - * why I handle this special case in ZFS code. 
- */ - if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb")) - goto again; - vm_page_busy(m); - VM_OBJECT_UNLOCK(obj); - if (dirbytes > 0) { - error = dmu_read_uio(os, zp->z_id, uio, - dirbytes); - dirbytes = 0; - } - if (error == 0) { - sched_pin(); - sf = sf_buf_alloc(m, SFB_CPUPRIVATE); - va = (caddr_t)sf_buf_kva(sf); - error = dmu_read(os, zp->z_id, start + off, - bytes, (void *)(va + off)); - sf_buf_free(sf); - sched_unpin(); - } - VM_OBJECT_LOCK(obj); - vm_page_wakeup(m); - if (error == 0) - uio->uio_resid -= bytes; + if (pp = page_hold(vp, start)) { + struct sf_buf *sf; + caddr_t va; + + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); +#ifdef illumos + error = uiomove(va + off, bytes, UIO_READ, uio); +#else + error = vn_io_fault_uiomove(va + off, bytes, uio); +#endif + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + page_unhold(pp); + } else { + zfs_vmobject_wunlock(obj); + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + zfs_vmobject_wlock(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + zfs_vmobject_wunlock(obj); + return (error); +} +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ + +caddr_t +zfs_map_page(page_t *pp, enum seg_rw rw) +{ + vaddr_t va; + int flags; + +#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS + if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va)) + return (caddr_t)va; +#endif + + flags = UVMPAGER_MAPIN_WAITOK | + (rw == S_READ ? UVMPAGER_MAPIN_WRITE : UVMPAGER_MAPIN_READ); + va = uvm_pagermapin(&pp, 1, flags); + return (caddr_t)va; +} + +void +zfs_unmap_page(page_t *pp, caddr_t addr) +{ + +#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS + vaddr_t va; + + if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va)) + return; +#endif + uvm_pagermapout((vaddr_t)addr, 1); +} + +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + struct uvm_object *uobj = &vp->v_uobj; + kmutex_t *mtx = uobj->vmobjlock; + int64_t start; + caddr_t va; + size_t len = nbytes; + int off; + int error = 0; + int npages, found; + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + pp = NULL; + npages = 1; + mutex_enter(mtx); + found = uvn_findpages(uobj, start, &npages, &pp, UFP_NOALLOC); + mutex_exit(mtx); + + /* XXXNETBSD shouldn't access userspace with the page busy */ + if (found) { + va = zfs_map_page(pp, S_READ); + error = uiomove(va + off, bytes, UIO_READ, uio); + zfs_unmap_page(pp, va); } else { - dirbytes += bytes; + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); } + + mutex_enter(mtx); + uvm_page_unbusy(&pp, 1); + mutex_exit(mtx); + len -= bytes; off = 0; if (error) break; } - VM_OBJECT_UNLOCK(obj); - if (error == 0 && dirbytes > 0) - error = dmu_read_uio(os, zp->z_id, uio, dirbytes); return (error); } -#endif /* PORT_NETBSD */ + +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, + int segflg, dmu_tx_t *tx) +{ + struct uvm_object *uobj = &vp->v_uobj; + kmutex_t *mtx = uobj->vmobjlock; + caddr_t va; + int off; + + ASSERT(vp->v_mount != NULL); + + mutex_enter(mtx); + + off = start & PAGEOFFSET; + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + page_t *pp; + int nbytes = MIN(PAGESIZE - off, len); + int npages, found; + + pp = NULL; + npages = 1; + found = uvn_findpages(uobj, start, &npages, &pp, UFP_NOALLOC); + if (found) { + mutex_exit(mtx); + + va = zfs_map_page(pp, S_WRITE); + (void) dmu_read(os, oid, start + off, nbytes, + va + off, 
DMU_READ_PREFETCH); + zfs_unmap_page(pp, va); + + mutex_enter(mtx); + uvm_page_unbusy(&pp, 1); + } + len -= nbytes; + off = 0; + } + mutex_exit(mtx); +} +#endif /* __NetBSD__ */ + offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* @@ -463,8 +821,7 @@ offset_t zfs_read_chunk_size = 1024 * 10 * * OUT: uio - updated offset and range, buffer filled. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 @@ -475,21 +832,17 @@ zfs_read(vnode_t *vp, uio_t *uio, int io { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - objset_t *os; ssize_t n, nbytes; - int error; + int error = 0; rl_t *rl; xuio_t *xuio = NULL; - dprintf("zfs_read called\n"); - ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - os = zfsvfs->z_os; - if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) { + if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); - return (EACCES); + return (SET_ERROR(EACCES)); } /* @@ -497,7 +850,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int io */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -511,7 +864,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int io /* * Check for mandatory locks */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { + if (MANDMODE(zp->z_mode)) { if (error = chklock(vp, FREAD, uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) { ZFS_EXIT(zfsvfs); @@ -522,8 +875,9 @@ zfs_read(vnode_t *vp, uio_t *uio, int io /* * If we're in FRSYNC mode, sync out this znode before reading it. */ - if (ioflag & FRSYNC) - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); + if (zfsvfs->z_log && + (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. @@ -534,14 +888,15 @@ zfs_read(vnode_t *vp, uio_t *uio, int io * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ - if (uio->uio_loffset >= zp->z_phys->zp_size) { + if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } - ASSERT(uio->uio_loffset < zp->z_phys->zp_size); - n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); -#ifdef PORT_SOLARIS + ASSERT(uio->uio_loffset < zp->z_size); + n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + +#ifdef illumos if ((uio->uio_extflg == UIO_XUIO) && (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { int nblk; @@ -565,24 +920,32 @@ zfs_read(vnode_t *vp, uio_t *uio, int io */ while (--nblk >= 0) { (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(zp->z_dbuf, blksz), - 0, blksz); + dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz), 0, blksz); } } } -#endif +#endif /* illumos */ + while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); -// if (vn_has_cached_data(vp)) -// error = mappedread(vp, nbytes, uio); -// else - error = dmu_read_uio(os, zp->z_id, uio, nbytes); +#ifdef __FreeBSD__ + if (uio->uio_segflg == UIO_NOCOPY) + error = mappedread_sf(vp, nbytes, uio); + else +#endif /* __FreeBSD__ */ + if (vn_has_cached_data(vp)) { + error = mappedread(vp, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) - error = EIO; + error = SET_ERROR(EIO); break; } @@ -597,70 +960,24 @@ out: } /* - * Fault in the pages of the first n bytes specified by the uio structure. - * 1 byte in each page is touched and the uio struct is unmodified. 
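Both the NetBSD mappedread()/update_pages() loops and the zfs_read() loop above slice an I/O into pieces that never cross a fixed-size boundary (a page, or zfs_read_chunk_size). A small sketch of the chunking math, using P2PHASE() as defined in the ZFS sysmacros (offset within a power-of-2 sized chunk) and invented offsets:

/*
 * Chunking illustration: each pass handles at most the bytes remaining
 * to the next chunk boundary, so after the first pass every read is
 * chunk-aligned.
 */
#include <stdint.h>
#include <stdio.h>

#define CHUNK (1024 * 1024)
#define P2PHASE(x, align) ((x) & ((align) - 1))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t loffset = 900 * 1024;     /* starting file offset */
	uint64_t n = 3 * 1024 * 1024;      /* bytes left to read */

	while (n > 0) {
		uint64_t nbytes = MIN(n, CHUNK - P2PHASE(loffset, CHUNK));

		printf("read %7llu bytes at %8llu\n",
		    (unsigned long long)nbytes, (unsigned long long)loffset);
		loffset += nbytes;
		n -= nbytes;
	}
	return 0;
}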
- * Any error will exit this routine as this is only a best - * attempt to get the pages resident. This is a copy of ufs_trans_touch(). - */ -static void -zfs_prefault_write(ssize_t n, struct uio *uio) -{ - struct iovec *iov; - ulong_t cnt, incr; - caddr_t p; - - if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) - return; - - iov = uio->uio_iov; - - while (n) { - cnt = MIN(iov->iov_len, n); - if (cnt == 0) { - /* empty iov entry */ - iov++; - continue; - } - n -= cnt; - /* - * touch each page in this segment. - */ - p = iov->iov_base; - while (cnt) { - if (fubyte(p) == -1) - return; - incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* - * touch the last byte in case it straddles a page. - */ - p--; - if (fubyte(p) == -1) - return; - iov++; - } -} - -/* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. - * ioflag - IO_APPEND flag set if in append mode. + * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is + * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ + /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) @@ -677,17 +994,25 @@ zfs_write(vnode_t *vp, uio_t *uio, int i ssize_t n, nbytes; rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; - uint64_t pflags; - int error; + int error = 0; arc_buf_t *abuf; - iovec_t *aiov; + iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; + int count = 0; + sa_bulk_attr_t bulk[4]; + uint64_t mtime[2], ctime[2]; + int segflg; - dprintf("zfs_write called\n"); +#ifdef __NetBSD__ + segflg = VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ? + UIO_SYSSPACE : UIO_USERSPACE; +#else + segflg = uio->uio_segflg; +#endif /* * Fasttrack empty write @@ -702,49 +1027,71 @@ zfs_write(vnode_t *vp, uio_t *uio, int i ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + /* - * If immutable or not appending then return EPERM + * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our + * callers might not be able to detect properly that we are read-only, + * so check it explicitly here. */ - pflags = zp->z_phys->zp_flags; - if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_phys->zp_size))) { + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); - return (EPERM); + return (SET_ERROR(EROFS)); } - zilog = zfsvfs->z_log; - + /* + * If immutable or not appending then return EPERM + */ + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + zilog = zfsvfs->z_log; + /* * Validate file offset */ - woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset; + woff = ioflag & FAPPEND ? 
zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* * Check for mandatory locks before calling zfs_range_lock() * in order to prevent a deadlock with locks set via fcntl(). */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + if (MANDMODE((mode_t)zp->z_mode) && (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { ZFS_EXIT(zfsvfs); return (error); } +#ifdef illumos /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. * Skip this if uio contains loaned arc_buf. */ - zfs_prefault_write(n, uio); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(MIN(n, max_blksz), uio); +#endif /* * If in append mode, set the io offset pointer to eof. */ - if (ioflag & IO_APPEND) { + if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. @@ -757,7 +1104,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int i * the file block size to increase. * Note that zp_size cannot change with this lock held. */ - woff = zp->z_phys->zp_size; + woff = zp->z_size; } uio->uio_loffset = woff; } else { @@ -769,19 +1116,32 @@ zfs_write(vnode_t *vp, uio_t *uio, int i rl = zfs_range_lock(zp, woff, n, RL_WRITER); } +#ifdef illumos if (woff >= limit) { zfs_range_unlock(rl); ZFS_EXIT(zfsvfs); - return (EFBIG); + return (SET_ERROR(EFBIG)); + } + +#endif +#ifdef __FreeBSD__ + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { + zfs_range_unlock(rl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); } +#endif +#ifdef __NetBSD__ + /* XXXNETBSD we might need vn_rlimit_fsize() too here eventually */ +#endif if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ - write_eof = (woff + n > zp->z_phys->zp_size); + write_eof = (woff + n > zp->z_size); - end_size = MAX(zp->z_phys->zp_size, woff + n); + end_size = MAX(zp->z_size, woff + n); /* * Write the file in reasonable size chunks. Each chunk is written @@ -791,14 +1151,11 @@ zfs_write(vnode_t *vp, uio_t *uio, int i while (n > 0) { abuf = NULL; woff = uio->uio_loffset; -again: - if (zfs_usergroup_overquota(zfsvfs, - B_FALSE, zp->z_phys->zp_uid) || - zfs_usergroup_overquota(zfsvfs, - B_TRUE, zp->z_phys->zp_gid)) { + if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || + zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { if (abuf != NULL) dmu_return_arcbuf(abuf); - error = EDQUOT; + error = SET_ERROR(EDQUOT); break; } @@ -814,7 +1171,7 @@ again: aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (abuf == NULL && n >= max_blksz && - woff >= zp->z_phys->zp_size && + woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* @@ -824,32 +1181,55 @@ again: * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). 
*/ +#ifdef illumos size_t cbytes; +#endif - abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); +#ifdef illumos if (error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes)) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); +#endif +#ifdef __FreeBSD__ + ssize_t resid = uio->uio_resid; + + error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio); + if (error != 0) { + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + dmu_return_arcbuf(abuf); + break; + } +#endif +#ifdef __NetBSD__ + ssize_t resid = uio->uio_resid; + + error = uiomove(abuf->b_data, max_blksz, UIO_WRITE, uio); + if (error != 0) { + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + dmu_return_arcbuf(abuf); + break; + } +#endif } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, TXG_NOWAIT); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto again; - } dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); @@ -866,8 +1246,14 @@ again: uint64_t new_blksz; if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } @@ -881,10 +1267,13 @@ again: */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + if (woff + nbytes > zp->z_size) + vnode_pager_setsize(vp, woff + nbytes); + if (abuf == NULL) { tx_bytes = uio->uio_resid; - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, - nbytes, tx); + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; @@ -904,22 +1293,26 @@ again: xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); - dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), + woff, abuf, tx); } +#ifdef illumos ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); +#endif } -#ifdef PORT_SOLARIS if (tx_bytes && vn_has_cached_data(vp)) { - update_pages(vp, woff, - tx_bytes, zfsvfs->z_os, zp->z_id); + update_pages(vp, woff, tx_bytes, zfsvfs->z_os, + zp->z_id, segflg, tx); } -#endif + /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; @@ -937,27 +1330,48 @@ again: * user 0 is not an ephemeral uid. 
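The block-size hunk in zfs_write() above only lets a file whose block size already exceeds the dataset recordsize grow to the next power of 2, capped by the new end of file. A sketch of that decision; highbit64() here mirrors the kernel helper (1-based index of the highest set bit) and the sizes are invented:

/*
 * Decide the new block size for a write that extends the file.
 */
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static int
highbit64(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t z_blksz = 160 * 1024;     /* odd-sized block, > recordsize */
	uint64_t max_blksz = 128 * 1024;   /* dataset recordsize */
	uint64_t end_size = 1024 * 1024;   /* size the write extends to */
	uint64_t new_blksz;

	if (z_blksz > max_blksz)
		new_blksz = MIN(end_size, 1ULL << highbit64(z_blksz));
	else
		new_blksz = MIN(end_size, max_blksz);

	printf("grow block size %llu -> %llu\n",
	    (unsigned long long)z_blksz, (unsigned long long)new_blksz);
	return 0;
}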
*/ mutex_enter(&zp->z_acl_lock); - if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) | + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && - (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, (zp->z_phys->zp_mode & S_ISUID) != 0 && zp->z_phys->zp_uid == 0) != 0) { - zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID); + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(vp, cr, + (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); - /* - * Update time stamp. NOTE: This marks the bonus buffer as - * dirty, so we don't have to do it again for zp_size. - */ - zfs_time_stamper(zp, CONTENT_MODIFIED, tx); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ - while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset) - (void) atomic_cas_64(&zp->z_phys->zp_size, end_size, + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); +#ifdef illumos + ASSERT(error == 0); +#else + ASSERT(error == 0 || error == EFAULT); +#endif + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + if (error == 0) + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + else + (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); @@ -965,6 +1379,11 @@ again: break; ASSERT(tx_bytes == nbytes); n -= nbytes; + +#ifdef illumos + if (!xuio && n > 0) + uio_prefaultpages(MIN(n, max_blksz), uio); +#endif } zfs_range_unlock(rl); @@ -978,11 +1397,22 @@ again: return (error); } - if (ioflag & (FSYNC | FDSYNC)) - zil_commit(zilog, zp->z_last_itx, zp->z_id); +#ifdef __FreeBSD__ + /* + * EFAULT means that at least one page of the source buffer was not + * available. VFS will re-try remaining I/O upon this error. + */ + if (error == EFAULT) { + ZFS_EXIT(zfsvfs); + return (error); + } +#endif + + if (ioflag & (FSYNC | FDSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); ZFS_EXIT(zfsvfs); - return (0); } @@ -1001,7 +1431,7 @@ zfs_get_done(zgd_t *zgd, int error) * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ - VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + VN_RELE_CLEANER(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); if (error == 0 && zgd->zgd_bp) zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); @@ -1036,16 +1466,16 @@ zfs_get_data(void *arg, lr_write_t *lr, /* * Nothing to do if the file has been removed */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (ENOENT); + if (zfs_zget_cleaner(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. 
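The z_size update loop in the zfs_write() hunk above ratchets the file size forward with compare-and-swap so concurrent writers never move it backwards. A userland analogue using C11 atomics in place of the kernel's atomic_cas_64():

/*
 * Monotonic size ratchet: only grow the shared size, retrying when a
 * concurrent writer changes it underneath us.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t z_size;

static void
grow_to(uint64_t new_eof)
{
	uint64_t end_size;

	while ((end_size = atomic_load(&z_size)) < new_eof) {
		/* on failure another writer moved it; re-check the loop */
		atomic_compare_exchange_weak(&z_size, &end_size, new_eof);
	}
}

int
main(void)
{
	atomic_store(&z_size, 4096);
	grow_to(10000);
	grow_to(8000);          /* must not shrink the file */
	printf("z_size = %llu\n",
	    (unsigned long long)atomic_load(&z_size));
	return 0;
}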
*/ - VN_RELE_ASYNC(ZTOV(zp), + VN_RELE_CLEANER(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); - return (ENOENT); + return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); @@ -1062,8 +1492,8 @@ zfs_get_data(void *arg, lr_write_t *lr, if (buf != NULL) { /* immediate write */ zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_phys->zp_size) { - error = ENOENT; + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); @@ -1089,18 +1519,25 @@ zfs_get_data(void *arg, lr_write_t *lr, zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_phys->zp_size) - error = ENOENT; + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); #ifdef DEBUG if (zil_fault_io) { - error = EIO; + error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db); + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); if (error == 0) { + blkptr_t *obp = dmu_buf_get_blkptr(db); + if (obp) { + ASSERT(BP_IS_HOLE(bp)); + *bp = *obp; + } + zgd->zgd_db = db; zgd->zgd_bp = bp; @@ -1153,6 +1590,310 @@ zfs_access(vnode_t *vp, int mode, int fl return (error); } +#ifdef __FreeBSD__ +static int +zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) +{ + int error; + + *vpp = arg; + error = vn_lock(*vpp, lkflags); + if (error != 0) + vrele(*vpp); + return (error); +} + +static int +zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) +{ + znode_t *zdp = VTOZ(dvp); + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error; + int ltype; + + ASSERT_VOP_LOCKED(dvp, __func__); +#ifdef DIAGNOSTIC + if ((zdp->z_pflags & ZFS_XATTR) == 0) + VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); +#endif + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + ASSERT3P(dvp, ==, vp); + vref(dvp); + ltype = lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* + * Relock for the "." case could leave us with + * reclaimed vnode. + */ + if (dvp->v_iflag & VI_DOOMED) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + /* + * Note that in this case, dvp is the child vnode, and we + * are looking up the parent vnode - exactly reverse from + * normal operation. Unlocking dvp requires some rather + * tricky unlock/relock dance to prevent mp from being freed; + * use vn_vget_ino_gen() which takes care of all that. + * + * XXX Note that there is a time window when both vnodes are + * unlocked. It is possible, although highly unlikely, that + * during that window the parent-child relationship between + * the vnodes may change, for example, get reversed. + * In that case we would have a wrong lock order for the vnodes. + * All other filesystems seem to ignore this problem, so we + * do the same here. 
+ * A potential solution could be implemented as follows: + * - using LK_NOWAIT when locking the second vnode and retrying + * if necessary + * - checking that the parent-child relationship still holds + * after locking both vnodes and retrying if it doesn't + */ + error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); + return (error); + } else { + error = vn_lock(vp, lkflags); + if (error != 0) + vrele(vp); + return (error); + } +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held vnode reference for it. + * + * IN: dvp - vnode of directory to search. + * nm - name of entry to lookup. + * pnp - full pathname to lookup [UNUSED]. + * flags - LOOKUP_XATTR set if looking for an attribute. + * rdir - root directory vnode [UNUSED]. + * cr - credentials of caller. + * ct - caller context + * + * OUT: vpp - vnode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +static int +zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp, + int nameiop, cred_t *cr, kthread_t *td, int flags) +{ + znode_t *zdp = VTOZ(dvp); + znode_t *zp; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + int error = 0; + + /* fast path (should be redundant with vfs namecache) */ + if (!(flags & LOOKUP_XATTR)) { + if (dvp->v_type != VDIR) { + return (SET_ERROR(ENOTDIR)); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); + } + } + + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *vpp = NULL; + + if (flags & LOOKUP_XATTR) { +#ifdef TODO + /* + * If the xattr property is off, refuse the lookup request. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } +#endif + + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0, + B_FALSE, cr)) { + vrele(*vpp); + *vpp = NULL; + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Check accessibility of directory. + */ + if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + + /* + * First handle the special cases. + */ + if ((cnp->cn_flags & ISDOTDOT) != 0) { + /* + * If we are a snapshot mounted under .zfs, return + * the vp for the snapshot directory. 
+ */ + if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { + struct componentname cn; + vnode_t *zfsctl_vp; + int ltype; + + ZFS_EXIT(zfsvfs); + ltype = VOP_ISLOCKED(dvp); + VOP_UNLOCK(dvp, 0); + error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, + &zfsctl_vp); + if (error == 0) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = cnp->cn_nameiop; + cn.cn_flags = cnp->cn_flags; + cn.cn_lkflags = cnp->cn_lkflags; + error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); + vput(zfsctl_vp); + } + vn_lock(dvp, ltype | LK_RETRY); + return (error); + } + } + if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { + ZFS_EXIT(zfsvfs); + if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); + return (error); + } + + /* + * The loop is retry the lookup if the parent-child relationship + * changes during the dot-dot locking complexities. + */ + for (;;) { + uint64_t parent; + + error = zfs_dirlook(zdp, nm, &zp); + if (error == 0) + *vpp = ZTOV(zp); + + ZFS_EXIT(zfsvfs); + if (error != 0) + break; + + error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); + if (error != 0) { + /* + * If we've got a locking error, then the vnode + * got reclaimed because of a force unmount. + * We never enter doomed vnodes into the name cache. + */ + *vpp = NULL; + return (error); + } + + if ((cnp->cn_flags & ISDOTDOT) == 0) + break; + + ZFS_ENTER(zfsvfs); + if (zdp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + } else { + error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + } + if (error != 0) { + ZFS_EXIT(zfsvfs); + vput(ZTOV(zp)); + break; + } + if (zp->z_id == parent) { + ZFS_EXIT(zfsvfs); + break; + } + vput(ZTOV(zp)); + } + +out: + if (error != 0) + *vpp = NULL; + + /* Translate errors and add SAVENAME when needed. */ + if (cnp->cn_flags & ISLASTCN) { + switch (nameiop) { + case CREATE: + case RENAME: + if (error == ENOENT) { + error = EJUSTRETURN; + cnp->cn_flags |= SAVENAME; + break; + } + /* FALLTHROUGH */ + case DELETE: + if (error == 0) + cnp->cn_flags |= SAVENAME; + break; + } + } + + /* Insert name into cache (as non-existent) if appropriate. */ + if (zfsvfs->z_use_namecache && + error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(dvp, NULL, cnp); + + /* Insert name into cache if appropriate. */ + if (zfsvfs->z_use_namecache && + error == 0 && (cnp->cn_flags & MAKEENTRY)) { + if (!(cnp->cn_flags & ISLASTCN) || + (nameiop != DELETE && nameiop != RENAME)) { + cache_enter(dvp, *vpp, cnp); + } + } + + return (error); +} +#endif /* __FreeBSD__ */ + +#ifdef __NetBSD__ /* * If vnode is for a device return a specfs vnode instead. */ @@ -1173,7 +1914,6 @@ specvp_check(vnode_t **vpp, cred_t *cr) return (error); } - /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. 
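[Editorial aside; the sketch below is not part of the patch.] The dot-dot branch of the FreeBSD zfs_lookup() above has to drop and re-take vnode locks, so it runs in a loop: once the target of ".." has been found and locked, the directory's parent attribute is read again and the whole lookup restarts if the relationship changed in the meantime. A minimal stand-alone C model of that revalidate-and-retry shape follows; the struct, the helper names, and the in-memory lookup are illustrative assumptions, not ZFS or kernel interfaces.

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

struct mnode {
	uint64_t id;
	_Atomic uint64_t parent;	/* object id of ".."; may change concurrently */
};

/* Stand-in for the real directory lookup: map a parent id to its node. */
static struct mnode *
find_node(struct mnode *nodes, size_t nnodes, uint64_t id)
{

	for (size_t i = 0; i < nnodes; i++) {
		if (nodes[i].id == id)
			return (&nodes[i]);
	}
	return (NULL);
}

/*
 * Resolve ".." of dir while tolerating concurrent renames: between reading
 * the parent id and (in the real code) locking the result, the link may
 * change, so it is re-read and the lookup retried until the answer is
 * consistent.
 */
static struct mnode *
lookup_dotdot(struct mnode *nodes, size_t nnodes, struct mnode *dir)
{
	struct mnode *pp;

	for (;;) {
		pp = find_node(nodes, nnodes, atomic_load(&dir->parent));
		if (pp == NULL)
			return (NULL);
		/* The real code acquires locks here and may drop them. */
		if (atomic_load(&dir->parent) == pp->id)
			return (pp);
		/* Lost a race with a rename; retry. */
	}
}
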
@@ -1199,20 +1939,20 @@ specvp_check(vnode_t **vpp, cred_t *cr) /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, - int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags, pathname_t *realpnp) { znode_t *zdp = VTOZ(dvp); + znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* fast path */ - if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { - + if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (ENOTDIR); - } else if (zdp->z_dbuf == NULL) { - return (EIO); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { @@ -1265,7 +2005,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode * We don't allow recursive attributes.. * Maybe someday we will. */ - if (zdp->z_phys->zp_flags & ZFS_XATTR) { + if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (EINVAL); } @@ -1307,13 +2047,16 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode return (EILSEQ); } - error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) - error = specvp_check(vpp, cr); + error = zfs_dirlook(zdp, nm, &zp); + if (error == 0) { + *vpp = ZTOV(zp); + error = specvp_check(vpp, cr); + } ZFS_EXIT(zfsvfs); return (error); } +#endif /* * Attempt to create a new entry in a directory. If the entry @@ -1328,12 +2071,11 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context - * vsecp - ACL to be set + * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created @@ -1343,30 +2085,38 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode /* ARGSUSED */ static int zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode, - vnode_t **vpp, cred_t *cr) + vnode_t **vpp, cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; - zfs_dirlock_t *dl; dmu_tx_t *tx; int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; void *vsecp = NULL; int flag = 0; - zfs_acl_ids_t acl_ids; - boolean_t fuid_dirtied; + uint64_t txtype; - dprintf("zfs_create called\n"); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || - IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr)))) - return (EINVAL); + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); @@ -1376,172 +2126,105 @@ zfs_create(vnode_t *dvp, char *name, vat if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); - return (EILSEQ); + return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { - if ((error = secpolicy_xvattr((xvattr_t *)vap, + if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } -top: + *vpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; - if (*name == '\0') { - /* - * Null component name refers to the directory 
itself. - */ - VN_HOLD(dvp); - zp = dzp; - dl = NULL; - error = 0; - } else { - /* possible VN_HOLD(zp) */ - int zflg = 0; - - if (flag & FIGNORECASE) - zflg |= ZCILOOK; + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL); - if (error) { - if (strcmp(name, "..") == 0) - error = EISDIR; - ZFS_EXIT(zfsvfs); - return (error); - } + /* + * Create a new file object and update the directory + * to reference it. + */ + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + goto out; } - if (zp == NULL) { - uint64_t txtype; - /* - * Create a new file object and update the directory - * to reference it. - */ - error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr); - if (error) { - goto out; - } + /* + * We only support the creation of regular files in + * extended attribute directories. + */ - /* - * We only support the creation of regular files in - * extended attribute directories. - */ - if ((dzp->z_phys->zp_flags & ZFS_XATTR) && - (vap->va_type != VREG)) { - error = EINVAL; - goto out; - } + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = SET_ERROR(EINVAL); + goto out; + } - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, - &acl_ids)) != 0) - goto out; - if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { - zfs_acl_ids_free(&acl_ids); - error = EDQUOT; - goto out; - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) - zfs_fuid_txhold(zfsvfs, tx); - dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); - } - error = dmu_tx_assign(tx, TXG_NOWAIT); - if (error) { - zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); - } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); + getnewvnode_reserve(1); - (void) zfs_link_create(dl, zp, tx, ZNEW); + tx = dmu_tx_create(os); - txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); - if (flag & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, acl_ids.z_fuidp, vap); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { zfs_acl_ids_free(&acl_ids); - dmu_tx_commit(tx); - } else { - int aflags = (flag & FAPPEND) ? V_APPEND : 0; + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - /* - * A directory entry already exists for this name. - */ - /* - * Can't truncate an existing file if in exclusive mode. 
- */ - if (excl == EXCL) { - error = EEXIST; - goto out; - } - /* - * Can't open a directory for writing. - */ - if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) { - error = EISDIR; - goto out; - } - /* - * Verify requested access to file. - */ - if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { - goto out; - } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); - mutex_enter(&dzp->z_lock); - dzp->z_seq++; - mutex_exit(&dzp->z_lock); + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); - /* - * Truncate regular files if requested. - */ - if ((ZTOV(zp)->v_type == VREG) && - (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) { - /* we can't hold any locks when calling zfs_freesp() */ - zfs_dirent_unlock(dl); - dl = NULL; - error = zfs_freesp(zp, 0, 0, mode, TRUE); - if (error == 0) { - vnevent_create(ZTOV(zp), NULL); - } - } - } -out: - if (dl) - zfs_dirent_unlock(dl); + getnewvnode_drop_reserve(); - if (error) { - if (zp) - VN_RELE(ZTOV(zp)); - } else { +out: + if (error == 0) { *vpp = ZTOV(zp); - error = specvp_check(vpp, cr); } + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (error); } @@ -1555,59 +2238,37 @@ out: * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ + /*ARGSUSED*/ static int -zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, - int flags) +zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { - znode_t *zp, *dzp = VTOZ(dvp); - znode_t *xzp = NULL; - vnode_t *vp; + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); + znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t acl_obj, xattr_obj; - zfs_dirlock_t *dl; + uint64_t obj = 0; dmu_tx_t *tx; - boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; - pathname_t *realnmp = NULL; - pathname_t realnm; int error; - int zflg = ZEXISTS; - dprintf("zfs_remove called\n"); - ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; + zp = VTOZ(vp); - if (flags & FIGNORECASE) { - zflg |= ZCILOOK; - pn_alloc(&realnm); - realnmp = &realnm; - } - -top: - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, realnmp)) { - if (realnmp) - pn_free(realnmp); - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); + xattr_obj = 0; + xzp = NULL; if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; @@ -1617,18 +2278,21 @@ top: * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { - error = EPERM; + error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); - if (realnmp) - dnlc_remove(dvp, realnmp->pn_buf); - else - dnlc_remove(dvp, name); + obj = zp->z_id; - may_delete_now = FALSE; + /* are there any extended attributes? 
*/ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } /* * We may delete the znode now, or we may put it in the unlinked set; @@ -1638,40 +2302,25 @@ top: */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); - if (may_delete_now) { - toobig = - zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT; - /* if the file is too big, only hold_free a token amount */ - dmu_tx_hold_free(tx, zp->z_id, 0, - (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); - } - - /* are there any extended attributes? */ - if ((xattr_obj = zp->z_phys->zp_xattr) != 0) { - /* XXX - do we need this if we are deleting? */ - dmu_tx_hold_bonus(tx, xattr_obj); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + + if (xzp) { + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } - /* are there any additional acls */ - if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 && - may_delete_now) - dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); - /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_NOWAIT); + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - if (realnmp) - pn_free(realnmp); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); @@ -1680,66 +2329,29 @@ top: /* * Remove the directory entry. 
*/ - error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } - if (0 && unlinked) { - KASSERT(0); /* NetBSD: must now happen now */ - VI_LOCK(vp); - delete_now = may_delete_now && !toobig && - vp->v_count == 1 && !vn_has_cached_data(vp) && - zp->z_phys->zp_xattr == xattr_obj && - zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj; - VI_UNLOCK(vp); - } - - if (delete_now) { - KASSERT(0); /* NetBSD: must now happen now */ - if (zp->z_phys->zp_xattr) { - error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp); - ASSERT3U(error, ==, 0); - ASSERT3U(xzp->z_phys->zp_links, ==, 2); - dmu_buf_will_dirty(xzp->z_dbuf, tx); - mutex_enter(&xzp->z_lock); - xzp->z_unlinked = 1; - xzp->z_phys->zp_links = 0; - mutex_exit(&xzp->z_lock); - zfs_unlinked_add(xzp, tx); - zp->z_phys->zp_xattr = 0; /* probably unnecessary */ - } - mutex_enter(&zp->z_lock); - VI_LOCK(vp); - vp->v_count--; - ASSERT3U(vp->v_count, ==, 0); - VI_UNLOCK(vp); - mutex_exit(&zp->z_lock); - zfs_znode_delete(zp, tx); - } else if (unlinked) { + if (unlinked) { zfs_unlinked_add(zp, tx); + vp->v_vflag |= VV_NOSYNC; } txtype = TX_REMOVE; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); + zfs_log_remove(zilog, tx, txtype, dzp, name, obj); dmu_tx_commit(tx); out: - if (realnmp) - pn_free(realnmp); - zfs_dirent_unlock(dl); + if (xzp) + vrele(ZTOV(xzp)); - if (!delete_now) { - VN_RELE(vp); - } else if (xzp) { - /* this rele is delayed to prevent nesting transactions */ - VN_RELE(ZTOV(xzp)); - } + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -1754,12 +2366,12 @@ out: * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context + * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. 
* * Timestamps: * dvp - ctime|mtime updated @@ -1767,18 +2379,18 @@ out: */ /*ARGSUSED*/ static int -zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, - caller_context_t *ct, int flags, vsecattr_t *vsecp) +zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr) { znode_t *zp, *dzp = VTOZ(dvp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; - int zf = ZNEW; - zfs_acl_ids_t acl_ids; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1788,88 +2400,96 @@ zfs_mkdir(vnode_t *dvp, char *dirname, v * make sure file system is at proper version */ + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && - (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))|| - IS_EPHEMERAL(crgetgid(cr)))) - return (EINVAL); + ((vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; - if (dzp->z_phys->zp_flags & ZFS_XATTR) { + if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); - return (EILSEQ); + return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zf |= ZCILOOK; - if (vap->va_mask & AT_XVATTR) - if ((error = secpolicy_xvattr((xvattr_t *)vap, + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } /* * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. */ -top: *vpp = NULL; - if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, - NULL, NULL)) { + if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) { + zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } + ASSERT3P(zp, ==, NULL); if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) { - zfs_dirent_unlock(dl); + zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } - if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, - &acl_ids)) != 0) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); - return (EDQUOT); + return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. 
*/ + getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, - 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, TXG_NOWAIT); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); + getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } @@ -1877,27 +2497,30 @@ top: /* * Create new node. */ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); + /* * Now put new name in parent dir. */ - (void) zfs_link_create(dl, zp, tx, ZNEW); + (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *vpp = ZTOV(zp); - txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); - zfs_dirent_unlock(dl); + getnewvnode_drop_reserve(); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); @@ -1915,116 +2538,68 @@ top: * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int -zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, - caller_context_t *ct, int flags) +zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); - znode_t *zp; - vnode_t *vp; + znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - int zflg = ZEXISTS; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - zp = NULL; - - /* - * Attempt to lock directory; fail if entry doesn't exist. - */ - if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, - NULL, NULL)) { - ZFS_EXIT(zfsvfs); - return (error); - } - - vp = ZTOV(zp); if (error = zfs_zaccess_delete(dzp, zp, cr)) { goto out; } if (vp->v_type != VDIR) { - error = ENOTDIR; - goto out; - } - - if (vp == cwd) { - error = EINVAL; + error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); - /* - * Grab a lock on the directory to make sure that noone is - * trying to add (or lookup) entries while we are removing it. - */ - rw_enter(&zp->z_name_lock, RW_WRITER); - - /* - * Grab a lock on the parent pointer to make sure we play well - * with the treewalk and directory rename code. 
- */ - rw_enter(&zp->z_parent_lock, RW_WRITER); - tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); - dmu_tx_hold_bonus(tx, zp->z_id); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_NOWAIT); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); - zfs_dirent_unlock(dl); - VN_RELE(vp); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } - /* Purge cache entries, while still holding locks. */ cache_purge(dvp); - cache_purge(vp); - error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_remove(zilog, tx, txtype, dzp, name); + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT); } dmu_tx_commit(tx); - rw_exit(&zp->z_parent_lock); - rw_exit(&zp->z_name_lock); + cache_purge(vp); out: - zfs_dirent_unlock(dl); - - VN_RELE(vp); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -2033,7 +2608,7 @@ out: /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in - * the uio structure. + * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, @@ -2045,8 +2620,7 @@ out: * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated @@ -2058,7 +2632,7 @@ out: */ /* ARGSUSED */ static int -zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies) +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, off_t **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; @@ -2072,21 +2646,32 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; - int ncooks; - u_long *cooks = NULL; + int ncooks = 0; + off_t *cooks = NULL; int flags = 0; +#ifdef __FreeBSD__ + boolean_t user = uio->uio_segflg != UIO_SYSSPACE; +#endif +#ifdef __NetBSD__ + boolean_t user = !VMSPACE_IS_KERNEL_P(uio->uio_vmspace); +#endif - dprintf("zfs_readdir called\n"); - ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* * If we are not given an eof variable, * use a local one. @@ -2099,7 +2684,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -2114,7 +2699,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; - + /* * Initialize the iterator cursor. 
*/ @@ -2135,13 +2720,13 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; - if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1) { + if (user || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); - memset(outbuf, 0, bufsize); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; + outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; @@ -2150,11 +2735,14 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre /* * Minimum entry size is dirent size and 1 byte for a file name. */ +#ifdef __FreeBSD__ + ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); + cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK); +#endif +#ifdef __NetBSD__ ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp); -// sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1); - cooks = kmem_alloc(ncooks * sizeof(u_long), KM_SLEEP); - - memset(cooks, 0, ncooks * sizeof(u_long)); + cooks = kmem_alloc(ncooks * sizeof(off_t), KM_SLEEP); +#endif *cookies = cooks; *ncookies = ncooks; } @@ -2180,7 +2768,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; - off64_t *next; + off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. @@ -2193,7 +2781,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; - objnum = zp->z_phys->zp_parent; + objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); @@ -2217,7 +2805,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); - error = ENXIO; + error = SET_ERROR(ENXIO); goto update; } @@ -2247,16 +2835,16 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { - VN_RELE(ZTOV(ezp)); + vrele(ZTOV(ezp)); goto skip_entry; } - VN_RELE(ZTOV(ezp)); + vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else - reclen = _DIRENT_RECLEN(odp, strlen(zap.za_name)); + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? @@ -2266,7 +2854,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre * Did we manage to fit anything in the buffer? 
*/ if (!outcount) { - error = EINVAL; + error = SET_ERROR(EINVAL); goto update; } break; @@ -2297,11 +2885,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre } outcount += reclen; - KASSERT(outcount <= bufsize); + ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0); + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); skip_entry: /* @@ -2317,7 +2906,12 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre if (cooks != NULL) { *cooks++ = offset; ncooks--; - KASSERT(ncooks >= 0); +#ifdef __FreeBSD__ + KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); +#endif +#ifdef __NetBSD__ + KASSERTMSG(ncooks >= 0, "ncooks=%d", ncooks); +#endif } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ @@ -2326,11 +2920,11 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre if (ncookies != NULL) *ncookies -= ncooks; - if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace) && uio->uio_iovcnt == 1) { + if (!user && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; - } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) { + } else if (error = uiomove(outbuf, (size_t)outcount, UIO_READ, uio)) { /* * Reset the pointer. */ @@ -2339,7 +2933,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cre update: zap_cursor_fini(&zc); - if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace) || uio->uio_iovcnt != 1) + if (user || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) @@ -2350,7 +2944,12 @@ update: uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { - kmem_free(*cookies, ncooks * sizeof(u_long)); +#ifdef __FreeBSD__ + free(*cookies, M_TEMP); +#endif +#ifdef __NetBSD__ + kmem_free(*cookies, ncooks * sizeof(off_t)); +#endif *cookies = NULL; *ncookies = 0; } @@ -2364,44 +2963,19 @@ zfs_fsync(vnode_t *vp, int syncflag, cre { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - error = 0; - - dprintf("zfs_fsync called vp %p -- zfsvfs %p\n", vp, zfsvfs); (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - /* - * NetBSD: if the sync is from reclaim or from ioflush, - * push dirty atime now. No need to lock: in the reclaim - * case, everything is single threaded and for ioflush this - * is a lazy writeback. - * - * XXXNETBSD: in the ioflush case, we don't want to push anything - * to disk immediately. We just want to queue the update so it - * will happen "soon". Check this is the case otherwise zfs will - * perform poorly. - */ - if (zp->z_atime_dirty && zp->z_unlinked == 0 && - (syncflag & (FSYNC_RECLAIM | FSYNC_LAZY)) != 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); - dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } +#ifdef __NetBSD__ + if (!zp->z_unlinked) +#endif + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); } - zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id); - ZFS_EXIT(zfsvfs); return (0); } @@ -2419,7 +2993,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cre * * OUT: vap - attribute values. * - * RETURN: 0 (always succeeds) + * RETURN: 0 (always succeeds). 
*/ /* ARGSUSED */ static int @@ -2428,28 +3002,41 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - znode_phys_t *pzp; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t links; + uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; - dprintf("zfs_getattr called\n"); - ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - pzp = zp->z_phys; + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vp->v_type == VBLK || vp->v_type == VCHR) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ - if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) && - (pzp->zp_uid != crgetuid(cr))) { + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { ZFS_EXIT(zfsvfs); @@ -2461,22 +3048,34 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ - mutex_enter(&zp->z_lock); - vap->va_type = IFTOVT(pzp->zp_mode); - vap->va_mode = pzp->zp_mode & ~S_IFMT; - zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; +#ifdef illumos + vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev; +#endif +#ifdef __FreeBSD__ + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; +#endif +#ifdef __NetBSD__ + vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; +#endif vap->va_nodeid = zp->z_id; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp)) - links = pzp->zp_links + 1; + links = zp->z_links + 1; else - links = pzp->zp_links; - vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */ - vap->va_size = pzp->zp_size; - vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0]; -// vap->va_fsid = 0; - vap->va_rdev = zfs_cmpldev(pzp->zp_rdev); + links = zp->z_links; + vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */ + vap->va_size = zp->z_size; +#ifdef illumos + vap->va_rdev = vp->v_rdev; +#else + if (vp->v_type == VBLK || vp->v_type == VCHR) + vap->va_rdev = zfs_cmpldev(rdev); +#endif vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. 
@@ -2485,116 +3084,104 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = - ((pzp->zp_flags & ZFS_ARCHIVE) != 0); + ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = - ((pzp->zp_flags & ZFS_READONLY) != 0); + ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = - ((pzp->zp_flags & ZFS_SYSTEM) != 0); + ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = - ((pzp->zp_flags & ZFS_HIDDEN) != 0); + ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = - ((pzp->zp_flags & ZFS_NOUNLINK) != 0); + ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0); + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = - ((pzp->zp_flags & ZFS_APPENDONLY) != 0); + ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = - ((pzp->zp_flags & ZFS_NODUMP) != 0); + ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = - ((pzp->zp_flags & ZFS_OPAQUE) != 0); + ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0); + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0); + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && - vp->v_type == VREG && - (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - - /* - * Only VREG files have anti-virus scanstamps, so we - * won't conflict with symlinks in the bonus buffer. - */ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len <= doi.doi_bonus_size) { - /* - * pzp points to the start of the - * znode_phys_t. pzp + 1 points to the - * first byte after the znode_phys_t. 
- */ - (void) memcpy(xoap->xoa_av_scanstamp, - pzp + 1, - sizeof (xoap->xoa_av_scanstamp)); - XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); - } - } - - if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { - ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); - XVA_SET_RTN(xvap, XAT_CREATETIME); + vp->v_type == VREG) { + zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - xoap->xoa_reparse = - ((pzp->zp_flags & ZFS_REPARSE) != 0); + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } - } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } - ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); - ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime); - ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime); - ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime); + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + } - mutex_exit(&zp->z_lock); + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, crtime); - dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks); + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ @@ -2621,8 +3208,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i * cr - credentials of caller. * ct - caller context * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. @@ -2630,42 +3216,43 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, i /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) + caller_context_t *ct) { znode_t *zp = VTOZ(vp); - znode_phys_t *pzp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; - uint_t saved_mask; + uint_t saved_mask = 0; + uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2]; znode_t *attrzp; int need_policy = FALSE; - int err; + int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; - zfs_acl_t *aclp = NULL; + zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - boolean_t fuid_dirtied = B_FALSE; - - dprintf("zfs_setattr called\n"); + boolean_t fuid_dirtied = B_FALSE; + sa_bulk_attr_t bulk[7], xattr_bulk[7]; + int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) - return (EINVAL); + return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - pzp = zp->z_phys; zilog = zfsvfs->z_log; /* @@ -2678,17 +3265,17 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, i ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); - return (EISDIR); + return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } /* @@ -2702,16 +3289,16 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, i /* * Immutable files can only alter immutable bit and atime */ - if ((pzp->zp_flags & ZFS_IMMUTABLE) && + if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); - return (EPERM); + return (SET_ERROR(EPERM)); } - if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) { + if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) { ZFS_EXIT(zfsvfs); - return (EPERM); + return (SET_ERROR(EPERM)); } /* @@ -2724,28 +3311,29 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, i if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); - return (EOVERFLOW); + return (SET_ERROR(EOVERFLOW)); } } + if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) && + TIMESPEC_OVERFLOW(&vap->va_birthtime)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } -top: attrzp = NULL; + aclp = NULL; /* Can this be moved to before the top label? 
*/ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); - return (EROFS); + return (SET_ERROR(EROFS)); } /* * First validate permissions */ + if (mask & AT_SIZE) { - err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); - } /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may @@ -2759,15 +3347,18 @@ top: return (err); } } - + if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || - XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); + } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); @@ -2780,7 +3371,7 @@ top: */ if (!(mask & AT_MODE)) - vap->va_mode = pzp->zp_mode; + vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of @@ -2798,7 +3389,7 @@ top: * Otherwise, send the check through secpolicy_vnode_setattr() * */ - + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { @@ -2807,7 +3398,7 @@ top: /* * Remove setuid/setgid for non-privileged users */ - secpolicy_setid_clear(vap, cr); + secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; @@ -2817,8 +3408,7 @@ top: } } - mutex_enter(&zp->z_lock); - oldva.va_mode = pzp->zp_mode; + oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* @@ -2830,7 +3420,7 @@ top: */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); @@ -2840,7 +3430,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); @@ -2850,7 +3440,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); @@ -2860,7 +3450,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) { + ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); @@ -2870,7 +3460,7 @@ top: if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); @@ -2882,7 +3472,7 @@ top: if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); @@ -2891,9 +3481,8 @@ top: } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { - mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); - return (EPERM); + return (SET_ERROR(EPERM)); } if (need_policy == FALSE && @@ -2903,8 +3492,6 @@ top: } } - mutex_exit(&zp->z_lock); - if (mask & AT_MODE) { if 
(zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, @@ -2931,6 +3518,13 @@ top: if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; + if (trim_mask & AT_MODE) { + /* + * Save the mode, as secpolicy_vnode_setattr() + * will overwrite it with ova.va_mode. + */ + saved_mode = vap->va_mode; + } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); @@ -2939,97 +3533,122 @@ top: return (err); } - if (trim_mask) + if (trim_mask) { vap->va_mask |= saved_mask; + if (trim_mask & AT_MODE) { + /* + * Recover the mode after + * secpolicy_vnode_setattr(). + */ + vap->va_mode = saved_mode; + } + } } + /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; + if ((mask & (AT_UID | AT_GID))) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); + if (err == 0) { + err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); + if (err != 0) + vrele(ZTOV(attrzp)); + } + if (err) + goto out2; + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != zp->z_uid && + zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != zp->z_gid && + zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, zp->z_id); if (mask & AT_MODE) { - uint64_t pmode = pzp->zp_mode; - + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = SET_ERROR(EPERM); + goto out; + } + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) goto out; - if (pzp->zp_acl.z_acl_extern_obj) { - /* Are we upgrading ACL from old V0 format to new V1 */ - if (zfsvfs->z_version <= ZPL_VERSION_FUID && - pzp->zp_acl.z_acl_version == + + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? 
+ */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { - dmu_tx_hold_free(tx, - pzp->zp_acl.z_acl_extern_obj, 0, + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { - dmu_tx_hold_write(tx, - pzp->zp_acl.z_acl_extern_obj, 0, + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } - } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if ((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } - if (mask & (AT_UID | AT_GID)) { - if (pzp->zp_xattr) { - err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); - if (err) - goto out; - dmu_tx_hold_bonus(tx, attrzp->z_id); - } - if (mask & AT_UID) { - new_uid = zfs_fuid_create(zfsvfs, - (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); - if (new_uid != pzp->zp_uid && - zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { - err = EDQUOT; - goto out; - } - } - - if (mask & AT_GID) { - new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, - cr, ZFS_GROUP, &fuidp); - if (new_gid != pzp->zp_gid && - zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { - err = EDQUOT; - goto out; - } - } - fuid_dirtied = zfsvfs->z_fuid_dirty; - if (fuid_dirtied) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } - err = dmu_tx_assign(tx, TXG_NOWAIT); - if (err) { - if (err == ERESTART) - dmu_tx_wait(tx); - goto out; - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); - dmu_buf_will_dirty(zp->z_dbuf, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. @@ -3038,47 +3657,105 @@ top: * updated as a side-effect of calling this function. 
*/ - mutex_enter(&zp->z_lock); + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } if (mask & AT_MODE) { - mutex_enter(&zp->z_acl_lock); - zp->z_phys->zp_mode = new_mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); - ASSERT3U(err, ==, 0); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; - mutex_exit(&zp->z_acl_lock); } - if (attrzp) - mutex_enter(&attrzp->z_lock); - if (mask & AT_UID) { - pzp->zp_uid = new_uid; - if (attrzp) - attrzp->z_phys->zp_uid = new_uid; + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); } - if (mask & AT_GID) { - pzp->zp_gid = new_gid; - if (attrzp) - attrzp->z_phys->zp_gid = new_gid; + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); } - if (attrzp) - mutex_exit(&attrzp->z_lock); - - if (mask & AT_ATIME) - ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); - - if (mask & AT_MTIME) - ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); - /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? 
*/ - if (mask & AT_SIZE) - zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx); - else if (mask != 0) - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, + B_TRUE); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime, + B_TRUE); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime, B_TRUE); + } + } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit @@ -3086,6 +3763,8 @@ top: if (xoap && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. @@ -3110,20 +3789,10 @@ top: XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } - if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { - size_t len; - dmu_object_info_t doi; - + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); - /* Grow the bonus buffer if necessary. */ - dmu_object_info_from_db(zp->z_dbuf, &doi); - len = sizeof (xoap->xoa_av_scanstamp) + - sizeof (znode_phys_t); - if (len > doi.doi_bonus_size) - VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0); - } - zfs_xvattr_set(zp, xvap); + zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) @@ -3132,11 +3801,22 @@ top: if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - mutex_exit(&zp->z_lock); + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + } out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + if (attrzp) - VN_RELE(ZTOV(attrzp)); + vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); @@ -3146,112 +3826,299 @@ out: fuidp = NULL; } - if (err) + if (err) { dmu_tx_abort(tx); - else + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); + } - if (err == ERESTART) - goto top; +out2: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } -typedef struct zfs_zlock { - krwlock_t *zl_rwlock; /* lock we acquired */ - znode_t *zl_znode; /* znode we held */ - struct zfs_zlock *zl_next; /* next in list */ -} zfs_zlock_t; - /* - * Drop locks and release vnodes that were held by zfs_rename_lock(). + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. On error release all vnode locks + * and decrement references the way tmpfs_rename() would do. 
*/ -static void -zfs_rename_unlock(zfs_zlock_t **zlpp) +static int +zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, + struct vnode *tdvp, struct vnode **tvpp, + const struct componentname *scnp, const struct componentname *tcnp) { - zfs_zlock_t *zl; + zfsvfs_t *zfsvfs; + struct vnode *nvp, *svp, *tvp; + znode_t *sdzp, *tdzp, *szp, *tzp; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error; + +#ifdef __FreeBSD__ + VOP_UNLOCK(tdvp, 0); + if (*tvpp != NULL && *tvpp != tdvp) + VOP_UNLOCK(*tvpp, 0); +#endif + +relock: + error = vn_lock(sdvp, LK_EXCLUSIVE); + if (error) + goto out; + sdzp = VTOZ(sdvp); + +#ifdef __NetBSD__ + if (tdvp == sdvp) { + } else { +#endif + error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); + if (error != EBUSY) + goto out; + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto out; + VOP_UNLOCK(tdvp, 0); + goto relock; + } +#ifdef __NetBSD__ + } /* end if (tdvp == sdvp) */ +#endif + + tdzp = VTOZ(tdvp); + + /* + * Before using sdzp and tdzp we must ensure that they are live. + * As a porting legacy from illumos we have two things to worry + * about. One is typical for FreeBSD and it is that the vnode is + * not reclaimed (doomed). The other is that the znode is live. + * The current code can invalidate the znode without acquiring the + * corresponding vnode lock if the object represented by the znode + * and vnode is no longer valid after a rollback or receive operation. + * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * that protects the znodes from the invalidation. + */ + zfsvfs = sdzp->z_zfsvfs; + ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); + ZFS_ENTER(zfsvfs); - while ((zl = *zlpp) != NULL) { - if (zl->zl_znode != NULL) - VN_RELE(ZTOV(zl->zl_znode)); - rw_exit(zl->zl_rwlock); - *zlpp = zl->zl_next; - kmem_free(zl, sizeof (*zl)); + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); +#ifdef __NetBSD__ + if (tdvp != sdvp) +#endif + VOP_UNLOCK(tdvp, 0); + error = SET_ERROR(EIO); + goto out; } -} -/* - * Search back through the directory tree, using the ".." entries. - * Lock each directory in the chain to prevent concurrent renames. - * Fail any attempt to move a directory into one of its own descendants. - * XXX - z_parent_lock can overlap with map or grow locks - */ -static int -zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) -{ - zfs_zlock_t *zl; - znode_t *zp = tdzp; - uint64_t rootid = zp->z_zfsvfs->z_root; - uint64_t *oidp = &zp->z_id; - krwlock_t *rwlp = &szp->z_parent_lock; - krw_t rw = RW_WRITER; + /* + * Re-resolve svp to be certain it still exists and fetch the + * correct vnode. + */ + error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); + if (error != 0) { + /* Source entry invalid or not there. */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); +#ifdef __NetBSD__ + if (tdvp != sdvp) +#endif + VOP_UNLOCK(tdvp, 0); + if ((scnp->cn_flags & ISDOTDOT) != 0 || + (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) + error = SET_ERROR(EINVAL); + goto out; + } + svp = ZTOV(szp); /* - * First pass write-locks szp and compares to zp->z_id. - * Later passes read-lock zp and compare to zp->z_parent. + * Re-resolve tvp, if it disappeared we just carry on. */ - do { - if (!rw_tryenter(rwlp, rw)) { - /* - * Another thread is renaming in this path. 
- * Note that if we are a WRITER, we don't have any - * parent_locks held yet. - */ - if (rw == RW_READER && zp->z_id > szp->z_id) { - /* - * Drop our locks and restart - */ - zfs_rename_unlock(&zl); - *zlpp = NULL; - zp = tdzp; - oidp = &zp->z_id; - rwlp = &szp->z_parent_lock; - rw = RW_WRITER; - continue; - } else { - /* - * Wait for other thread to drop its locks - */ - rw_enter(rwlp, rw); + error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); + if (error != 0) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK(sdvp, 0); +#ifdef __NetBSD__ + if (tdvp != sdvp) +#endif + VOP_UNLOCK(tdvp, 0); + vrele(svp); + if ((tcnp->cn_flags & ISDOTDOT) != 0) + error = SET_ERROR(EINVAL); + goto out; + } + if (tzp != NULL) + tvp = ZTOV(tzp); + else + tvp = NULL; + + /* + * At present the vnode locks must be acquired before z_teardown_lock, + * although it would be more logical to use the opposite order. + */ + ZFS_EXIT(zfsvfs); + + /* + * Now try acquire locks on svp and tvp. + */ + nvp = svp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); +#ifdef __NetBSD__ + if (tdvp != sdvp) +#endif + VOP_UNLOCK(tdvp, 0); + if (tvp != NULL) + vrele(tvp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + VOP_UNLOCK(nvp, 0); + /* + * Concurrent rename race. + * XXX ? + */ + if (nvp == tdvp) { + vrele(nvp); + error = SET_ERROR(EINVAL); + goto out; + } +#ifdef __NetBSD__ + if (*svpp != NULL) +#endif + vrele(*svpp); + *svpp = nvp; + goto relock; + } +#ifdef __NetBSD__ + if (*svpp != NULL) +#endif + vrele(*svpp); + *svpp = nvp; + + if (*tvpp != NULL) + vrele(*tvpp); + *tvpp = NULL; + if (tvp != NULL) { + nvp = tvp; + +#ifdef __NetBSD__ + if (tvp == svp || tvp == sdvp) { + } else { +#endif + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK(sdvp, 0); +#ifdef __NetBSD__ + if (tdvp != sdvp) +#endif + VOP_UNLOCK(tdvp, 0); +#ifdef __NetBSD__ + if (*svpp != tdvp) +#endif + VOP_UNLOCK(*svpp, 0); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; } + vput(nvp); + goto relock; } +#ifdef __NetBSD__ + } /* end if (tvp == svp || tvp == sdvp) */ +#endif + + *tvpp = nvp; + } - zl = kmem_alloc(sizeof (*zl), KM_SLEEP); - zl->zl_rwlock = rwlp; - zl->zl_znode = NULL; - zl->zl_next = *zlpp; - *zlpp = zl; + KASSERT(VOP_ISLOCKED(sdvp) == LK_EXCLUSIVE); + KASSERT(VOP_ISLOCKED(*svpp) == LK_EXCLUSIVE); + KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE); + KASSERT(*tvpp == NULL || VOP_ISLOCKED(*tvpp) == LK_EXCLUSIVE); - if (*oidp == szp->z_id) /* We're a descendant of szp */ - return (EINVAL); + return (0); - if (*oidp == rootid) /* We've hit the top */ - return (0); +out: + return (error); +} - if (rw == RW_READER) { /* i.e. not the first pass */ - int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp); - if (error) - return (error); - zl->zl_znode = zp; - } - oidp = &zp->z_phys->zp_parent; - rwlp = &zp->z_parent_lock; - rw = RW_READER; +/* + * Note that we must use VRELE_ASYNC in this function as it walks + * up the directory tree and vrele may need to acquire an exclusive + * lock if a last reference to a vnode is dropped. 
+ */ +static int +zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) +{ + zfsvfs_t *zfsvfs; + znode_t *zp, *zp1; + uint64_t parent; + int error; + + zfsvfs = tdzp->z_zfsvfs; + if (tdzp == szp) + return (SET_ERROR(EINVAL)); + if (tdzp == sdzp) + return (0); + if (tdzp->z_id == zfsvfs->z_root) + return (0); + zp = tdzp; + for (;;) { + ASSERT(!zp->z_unlinked); + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + break; - } while (zp->z_id != sdzp->z_id); + if (parent == szp->z_id) { + error = SET_ERROR(EINVAL); + break; + } + if (parent == zfsvfs->z_root) + break; + if (parent == sdzp->z_id) + break; - return (0); + error = zfs_zget(zfsvfs, parent, &zp1); + if (error != 0) + break; + + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + zp = zp1; + } + + if (error == ENOTDIR) + panic("checkpath: .. not a directory\n"); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + return (error); } /* @@ -3266,213 +4133,100 @@ zfs_rename_lock(znode_t *szp, znode_t *t * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ -/* XXX NetBSD There is significant problem with dirent locking during rename - * of files which are in a same dir. zfs_dirent_lock is then called twice on - * same lock which panics LOCKDEBUG kernel. Locking twice is not needed. - * Proper solution for this is add new flag to zfs_dirent_lock which will - * disable rw_enter in it. Renaming of files in same dir is considered as broken - * on LOCKDEBUG kernels on NetBSD for now. - */ /*ARGSUSED*/ static int -zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, - caller_context_t *ct, int flags) +zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, + vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, + cred_t *cr) { - znode_t *tdzp, *szp, *tzp; - znode_t *sdzp = VTOZ(sdvp); - zfsvfs_t *zfsvfs = sdzp->z_zfsvfs; - zilog_t *zilog; - vnode_t *realvp; - zfs_dirlock_t *sdl, *tdl; + zfsvfs_t *zfsvfs; + znode_t *sdzp, *tdzp, *szp, *tzp; + zilog_t *zilog = NULL; dmu_tx_t *tx; - zfs_zlock_t *zl; - int cmp, serr, terr; + char *snm = __UNCONST(scnp->cn_nameptr); + char *tnm = __UNCONST(tcnp->cn_nameptr); int error = 0; - int zflg = 0; - int samedir = 0; - - tdl = NULL; - sdl = NULL; - - dprintf("zfs_rename called\n"); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(sdzp); - zilog = zfsvfs->z_log; - /* - * Make sure we have the real vp for the target directory. - */ - if (VOP_REALVP(tdvp, &realvp, ct) == 0) - tdvp = realvp; - - if (tdvp->v_vfsp != sdvp->v_vfsp) { - ZFS_EXIT(zfsvfs); - return (EXDEV); + /* Reject renames across filesystems. */ + if (((*svpp) != NULL && (*svpp)->v_mount != tdvp->v_mount) || + ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { + error = SET_ERROR(EXDEV); + goto out; } - tdzp = VTOZ(tdvp); - ZFS_VERIFY_ZP(tdzp); - if (zfsvfs->z_utf8 && u8_validate(tnm, - strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { - ZFS_EXIT(zfsvfs); - return (EILSEQ); + if (zfsctl_is_node(tdvp)) { + error = SET_ERROR(EXDEV); + goto out; } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; - -top: - szp = NULL; - tzp = NULL; - zl = NULL; - /* - * This is to prevent the creation of links into attribute space - * by renaming a linked file into/outof an attribute directory. 
- * See the comment in zfs_link() for why this is considered bad. + * Lock all four vnodes to ensure safety and semantics of renaming. */ - if ((tdzp->z_phys->zp_flags & ZFS_XATTR) != - (sdzp->z_phys->zp_flags & ZFS_XATTR)) { - ZFS_EXIT(zfsvfs); - return (EINVAL); + error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); + if (error != 0) { + /* no vnodes are locked in the case of error here */ + return (error); } + tdzp = VTOZ(tdvp); + sdzp = VTOZ(sdvp); + zfsvfs = tdzp->z_zfsvfs; + zilog = zfsvfs->z_log; + /* - * Lock source and target directory entries. To prevent deadlock, - * a lock ordering must be defined. We lock the directory with - * the smallest object id first, or if it's a tie, the one with - * the lexically first name. + * After we re-enter ZFS_ENTER() we will have to revalidate all + * znodes involved. */ - if (sdzp->z_id < tdzp->z_id) { - cmp = -1; - } else if (sdzp->z_id > tdzp->z_id) { - cmp = 1; - } else { - /* - * First compare the two name arguments without - * considering any case folding. - */ - int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + ZFS_ENTER(zfsvfs); - cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); - ASSERT(error == 0 || !zfsvfs->z_utf8); - if (cmp == 0) { - /* - * POSIX: "If the old argument and the new argument - * both refer to links to the same existing file, - * the rename() function shall return successfully - * and perform no other action." - */ - ZFS_EXIT(zfsvfs); - return (0); - } - /* - * If the file system is case-folding, then we may - * have some more checking to do. A case-folding file - * system is either supporting mixed case sensitivity - * access or is completely case-insensitive. Note - * that the file system is always case preserving. - * - * In mixed sensitivity mode case sensitive behavior - * is the default. FIGNORECASE must be used to - * explicitly request case insensitive behavior. - * - * If the source and target names provided differ only - * by case (e.g., a request to rename 'tim' to 'Tim'), - * we will treat this as a special case in the - * case-insensitive mode: as long as the source name - * is an exact match, we will allow this to proceed as - * a name-change request. - */ - if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - (zfsvfs->z_case == ZFS_CASE_MIXED && - flags & FIGNORECASE)) && - u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, - &error) == 0) { - /* - * case preserving rename request, require exact - * name matches - */ - zflg |= ZCIEXACT; - zflg &= ~ZCILOOK; - } + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + error = SET_ERROR(EILSEQ); + goto unlockout; } - /* - * If the source and destination directories are the same, we should - * grab the z_name_lock of that directory only once. - */ - if (sdzp == tdzp) { - zflg |= ZHAVELOCK; - rw_enter(&sdzp->z_name_lock, RW_READER); + /* If source and target are the same file, there is nothing to do. */ + if ((*svpp) == (*tvpp)) { + error = 0; + goto unlockout; } - if (cmp < 0) { - - serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, - ZEXISTS | zflg, NULL, NULL); - if ((serr == 0) && (sdzp == tdzp)) { - /* - * If renaming within the one directory we must - * be careful not to recursively acquire locks. 
- */ - zflg |= ZHAVELOCK; - } - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); - } else { - terr = zfs_dirent_lock(&tdl, - tdzp, tnm, &tzp, zflg, NULL, NULL); - - if ((terr == 0) && (sdzp == tdzp)) { - /* - * If renaming within the one directory we must - * be careful not to recursively acquire locks. - */ - zflg |= ZHAVELOCK; - } - serr = zfs_dirent_lock(&sdl, - sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, - NULL, NULL); + if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || + ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && + (*tvpp)->v_mountedhere != NULL)) { + error = SET_ERROR(EXDEV); + goto unlockout; } - if (serr) { - /* - * Source entry invalid or not there. - */ - if (!terr) { - zfs_dirent_unlock(tdl); - if (tzp) - VN_RELE(ZTOV(tzp)); - } - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - if (strcmp(snm, "..") == 0) - serr = EINVAL; - ZFS_EXIT(zfsvfs); - return (serr); + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + goto unlockout; } - if (terr) { - if (sdl != NULL) - zfs_dirent_unlock(sdl); - VN_RELE(ZTOV(szp)); - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); + szp = VTOZ(*svpp); + tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); + if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { + error = SET_ERROR(EIO); + goto unlockout; + } - if (strcmp(tnm, "..") == 0) - terr = EINVAL; - ZFS_EXIT(zfsvfs); - return (terr); + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + error = SET_ERROR(EINVAL); + goto unlockout; } /* @@ -3481,17 +4235,26 @@ top: * Note that if target and source are the same, this can be * done in a single check. */ - if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)) - goto out; + goto unlockout; + + if ((*svpp)->v_type == VDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || + sdzp == szp || + (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = SET_ERROR(EINVAL); + goto unlockout; + } - if (ZTOV(szp)->v_type == VDIR) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ - if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl)) - goto out; + if (error = zfs_rename_check(szp, sdzp, tdzp)) + goto unlockout; } /* @@ -3501,17 +4264,22 @@ top: /* * Source and target must be the same type. 
*/ - if (ZTOV(szp)->v_type == VDIR) { - if (ZTOV(tzp)->v_type != VDIR) { - error = ENOTDIR; - goto out; + if ((*svpp)->v_type == VDIR) { + if ((*tvpp)->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto unlockout; + } else { + cache_purge(tdvp); + if (sdvp != tdvp) + cache_purge(sdvp); } } else { - if (ZTOV(tzp)->v_type == VDIR) { - error = EISDIR; - goto out; + if ((*tvpp)->v_type == VDIR) { + error = SET_ERROR(EISDIR); + goto unlockout; } } + /* * POSIX dictates that when the source and target * entries refer to the same file object, rename @@ -3528,14 +4296,14 @@ top: */ if (szp->z_id == tzp->z_id) { error = 0; - goto out; + goto unlockout; } #endif } - vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); + vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) - vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); + vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same @@ -3546,87 +4314,97 @@ top: } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ - dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); - if (sdzp != tdzp) - dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ - if (tzp) - dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); - - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (error); + goto unlockout; } + if (tzp && (tzp->z_id != szp->z_id)) /* Attempt to remove the existing target */ - error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { if (!tzp || (tzp->z_id != szp->z_id)) - error = zfs_link_create(tdl, szp, tx, ZRENAMING); + error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { - szp->z_phys->zp_flags |= ZFS_AV_MODIFIED; + szp->z_pflags |= ZFS_AV_MODIFIED; - error = zfs_link_destroy(sdl, szp, tx, - /* Kludge for BSD rename semantics. */ - ((tzp && (tzp->z_id == szp->z_id)) ? - zflg : ZRENAMING), NULL); - ASSERT(error == 0); + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); - zfs_log_rename(zilog, tx, - TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), - sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); + error = zfs_link_destroy(sdzp, snm, szp, tx, + /* Kludge for BSD rename semantics. */ + tzp && tzp->z_id == szp->z_id ? 
0: ZRENAMING, NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + snm, tdzp, tnm, szp); - /* Update path information for the target vnode */ - vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm)); + /* + * Update path information for the target vnode + */ + vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, + ZRENAMING, NULL), ==, 0); + } } if (error == 0) { - /* Purge cache entries, while still holding locks. */ - cache_purge(sdvp); - cache_purge(tdvp); + cache_purge(*svpp); + if (*tvpp != NULL) + cache_purge(*tvpp); + cache_purge_negative(tdvp); } } dmu_tx_commit(tx); -out: - if (zl != NULL) - zfs_rename_unlock(&zl); - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - - if (sdzp == tdzp) - rw_exit(&sdzp->z_name_lock); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); +unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ + ZFS_EXIT(zfsvfs); - VN_RELE(ZTOV(szp)); - if (tzp) - VN_RELE(ZTOV(tzp)); + VOP_UNLOCK(*svpp, 0); + VOP_UNLOCK(sdvp, 0); - ZFS_EXIT(zfsvfs); + if (*tvpp != sdvp && *tvpp != *svpp) + if (*tvpp != NULL) + VOP_UNLOCK(*tvpp, 0); + if (tdvp != sdvp && tdvp != *svpp) + if (tdvp != *tvpp) + VOP_UNLOCK(tdvp, 0); +out: return (error); } @@ -3636,13 +4414,11 @@ out: * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. - * target - Target path of new symlink. * cr - credentials of caller. * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated @@ -3650,18 +4426,18 @@ out: /*ARGSUSED*/ static int zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, - cred_t *cr, int flags) + cred_t *cr, kthread_t *td) { znode_t *zp, *dzp = VTOZ(dvp); - zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - int len = strlen(link); + uint64_t len = strlen(link); int error; - int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; + int flags = 0; ASSERT(vap->va_type == VLNK); @@ -3672,111 +4448,99 @@ zfs_symlink(vnode_t *dvp, vnode_t **vpp, if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); - return (EILSEQ); + return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zflg |= ZCILOOK; -top: - if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + + if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); - return (error); + return (SET_ERROR(ENAMETOOLONG)); } - if (len > MAXPATHLEN) { + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); - return (ENAMETOOLONG); + return (error); } /* * Attempt to lock directory; fail if entry already exists. 
*/ - error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { + zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } - VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); - return (EDQUOT); + return (SET_ERROR(EDQUOT)); } + + getnewvnode_reserve(1); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); - dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, TXG_NOWAIT); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); - zfs_dirent_unlock(dl); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); + getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } - dmu_buf_will_dirty(dzp->z_dbuf, tx); - /* * Create a new object for the symlink. - * Put the link content into bonus buffer if it will fit; - * otherwise, store it just like any other file data. + * for version 4 ZPL datsets the symlink will be an SA attribute */ - if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); - if (len != 0) - bcopy(link, zp->z_phys + 1, len); - } else { - dmu_buf_t *dbp; + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); - - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - /* - * Nothing can access the znode yet so no locking needed - * for growing the znode's blocksize. - */ - zfs_grow_blocksize(zp, len, tx); - - VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, - zp->z_id, 0, FTAG, &dbp)); - dmu_buf_will_dirty(dbp, tx); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); - ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp, FTAG); - } - zp->z_phys->zp_size = len; + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ - (void) zfs_link_create(dl, zp, tx, ZNEW); - if (error == 0) { - uint64_t txtype = TX_SYMLINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - *vpp = ZTOV(zp); - } + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + *vpp = ZTOV(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); - zfs_dirent_unlock(dl); + getnewvnode_drop_reserve(); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); @@ -3787,14 +4551,13 @@ top: * the symbolic path referred to by vp. 
* * IN: vp - vnode of symbolic link. - * uoip - structure to contain the link path. + * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * - * OUT: uio - structure to contain the link path. + * OUT: uio - structure containing the link path. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated @@ -3805,29 +4568,19 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cr { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - size_t bufsz; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - bufsz = (size_t)zp->z_phys->zp_size; - if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) { - error = uiomove(zp->z_phys + 1, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - } else { - dmu_buf_t *dbp; - error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); - if (error) { - ZFS_EXIT(zfsvfs); - return (error); - } - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp, FTAG); - } + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); return (error); } @@ -3841,8 +4594,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cr * cr - credentials of caller. * ct - caller context * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated @@ -3857,11 +4609,9 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, ch znode_t *tzp, *szp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; - zfs_dirlock_t *dl; dmu_tx_t *tx; - vnode_t *realvp; int error; - int zf = ZNEW; + uint64_t parent; uid_t owner; ASSERT(tdvp->v_type == VDIR); @@ -3870,51 +4620,57 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, ch ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; - if (VOP_REALVP(svp, &realvp, ct) == 0) - svp = realvp; - - if (svp->v_vfsp != tdvp->v_vfsp) { + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (svp->v_type == VDIR) { ZFS_EXIT(zfsvfs); - return (EXDEV); + return (SET_ERROR(EPERM)); } + szp = VTOZ(svp); ZFS_VERIFY_ZP(szp); + if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); - return (EILSEQ); + return (SET_ERROR(EILSEQ)); } - if (flags & FIGNORECASE) - zf |= ZCILOOK; -top: /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ - if ((szp->z_phys->zp_flags & ZFS_XATTR) != - (dzp->z_phys->zp_flags & ZFS_XATTR)) { + if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); - return (EINVAL); + return (SET_ERROR(EINVAL)); } - /* - * POSIX dictates that we return EPERM here. - * Better choices include ENOTSUP or EISDIR. 
- */ - if (svp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER); - if (owner != crgetuid(cr) && - secpolicy_basic_link(cr) != 0) { + owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) { ZFS_EXIT(zfsvfs); - return (EPERM); + return (SET_ERROR(EPERM)); } if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) { @@ -3925,73 +4681,131 @@ top: /* * Attempt to lock directory; fail if entry already exists. */ - error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); + error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_bonus(tx, szp->z_id); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - error = dmu_tx_assign(tx, TXG_NOWAIT); + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, dzp); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { - zfs_dirent_unlock(dl); - if (error == ERESTART) { - dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } - error = zfs_link_create(dl, szp, tx, 0); + error = zfs_link_create(dzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; - if (flags & FIGNORECASE) - txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, dzp, szp, name); } dmu_tx_commit(tx); - zfs_dirent_unlock(dl); - if (error == 0) { vnevent_link(svp, ct); } + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + ZFS_EXIT(zfsvfs); return (error); } + /*ARGSUSED*/ +void +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. 
+ */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + dmu_tx_commit(tx); + } + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} -/* CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); */ -/* CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); */ + +#ifdef __FreeBSD__ +CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid)); +CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid)); +#endif /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { - /* XXX This should bre reviewed maybe Opensolaris version of zfs_fid can - be used for NetBSD */ znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; + uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; - int size, i; + int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - gen = (uint32_t)zp->z_gen; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; + +#ifdef illumos + if (fidp->fid_len < size) { + fidp->fid_len = size; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOSPC)); + } +#else fidp->fid_len = size; +#endif zfid = (zfid_short_t *)fidp; @@ -4024,656 +4838,14 @@ zfs_fid(vnode_t *vp, fid_t *fidp, caller return (0); } -/* - * Copy the portion of the file indicated from pages into the file. - * The pages are stored in a page list attached to the files vnode. - * - * IN: vp - vnode of file to push page data to. - * off - position in file to put data. - * len - amount of data to write. - * flags - flags to control the operation. - * cr - credentials of caller. - * ct - caller context. - * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - ctime|mtime updated - */ -/*ARGSUSED*/ -#ifdef PORT_SOLARIS static int -zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp; - size_t io_len; - u_offset_t io_off; - uint_t blksz; - rl_t *rl; - int error = 0; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + znode_t *zp, *xzp; + zfsvfs_t *zfsvfs; + int error; - /* - * Align this request to the file block size in case we kluster. - * XXX - this can result in pretty aggresive locking, which can - * impact simultanious read/write access. One option might be - * to break up long requests (len == 0) into block-by-block - * operations to get narrower locking. - */ - blksz = zp->z_blksz; - if (ISP2(blksz)) - io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); - else - io_off = 0; - if (len > 0 && ISP2(blksz)) - io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); - else - io_len = 0; - - if (io_len == 0) { - /* - * Search the entire vp list for pages >= io_off. 
- */ - rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); - error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); - goto out; - } - rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - - if (off > zp->z_phys->zp_size) { - /* past end of file */ - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (0); - } - - len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off); - - for (off = io_off; io_off < off + len; io_off += io_len) { - if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { - pp = page_lookup(vp, io_off, - (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); - } else { - pp = page_lookup_nowait(vp, io_off, - (flags & B_FREE) ? SE_EXCL : SE_SHARED); - } - - if (pp != NULL && pvn_getdirty(pp, flags)) { - int err; - - /* - * Found a dirty page to push - */ - err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); - if (err) - error = err; - } else { - io_len = PAGESIZE; - } - } -out: - zfs_range_unlock(rl); - if ((flags & B_ASYNC) == 0) - zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); - ZFS_EXIT(zfsvfs); - return (error); -} - -/*ARGSUSED*/ -void -zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); - if (zp->z_dbuf == NULL) { - /* - * The fs has been unmounted, or we did a - * suspend/resume and this file no longer exists. - */ - if (vn_has_cached_data(vp)) { - (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage, - B_INVAL, cr); - } - - mutex_enter(&zp->z_lock); - mutex_enter(&vp->v_lock); - ASSERT(vp->v_count == 1); - vp->v_count = 0; - mutex_exit(&vp->v_lock); - mutex_exit(&zp->z_lock); - rw_exit(&zfsvfs->z_teardown_inactive_lock); - zfs_znode_free(zp); - return; - } - - /* - * Attempt to push any data in the page cache. If this fails - * we will get kicked out later in zfs_zinactive(). - */ - if (vn_has_cached_data(vp)) { - (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC, - cr); - } - - if (zp->z_atime_dirty && zp->z_unlinked == 0) { - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - - dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - dmu_buf_will_dirty(zp->z_dbuf, tx); - mutex_enter(&zp->z_lock); - zp->z_atime_dirty = 0; - mutex_exit(&zp->z_lock); - dmu_tx_commit(tx); - } - } - - zfs_zinactive(zp); - rw_exit(&zfsvfs->z_teardown_inactive_lock); -} -#endif /* PORT_SOLARIS */ - -/* - * Bounds-check the seek operation. - * - * IN: vp - vnode seeking within - * ooff - old file offset - * noffp - pointer to new file offset - * ct - caller context - * - * RETURN: 0 if success - * EINVAL if new offset invalid - */ -/* ARGSUSED */ -static int -zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, - caller_context_t *ct) -{ - if (vp->v_type == VDIR) - return (0); - return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); -} - -#ifdef PORT_SOLARIS -/* - * Pre-filter the generic locking function to trap attempts to place - * a mandatory lock on a memory mapped file. 
- */ -static int -zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, - flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* - * We are following the UFS semantics with respect to mapcnt - * here: If we see that the file is mapped already, then we will - * return an error, but we don't worry about races between this - * function and zfs_map(). - */ - if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { - ZFS_EXIT(zfsvfs); - return (EAGAIN); - } - ZFS_EXIT(zfsvfs); - return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); -} - - -/* - * If we can't find a page in the cache, we will create a new page - * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering) to fill up the supplied page - * list. Note that the pages to be filled are held with an exclusive - * lock to prevent access by other threads while they are being filled. - */ -static int -zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, - caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) -{ - znode_t *zp = VTOZ(vp); - page_t *pp, *cur_pp; - objset_t *os = zp->z_zfsvfs->z_os; - u_offset_t io_off, total; - size_t io_len; - int err; - - if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { - /* - * We only have a single page, don't bother klustering - */ - io_off = off; - io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, - PG_EXCL | PG_WAIT, seg, addr); - } else { - /* - * Try to find enough pages to fill the page list - */ - pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, off, plsz, 0); - } - if (pp == NULL) { - /* - * The page already exists, nothing to do here. - */ - *pl = NULL; - return (0); - } - - /* - * Fill the pages in the kluster. - */ - cur_pp = pp; - for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { - caddr_t va; - - ASSERT3U(io_off, ==, cur_pp->p_offset); - va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, - DMU_READ_PREFETCH); - zfs_unmap_page(cur_pp, va); - if (err) { - /* On error, toss the entire kluster */ - pvn_read_done(pp, B_ERROR); - /* convert checksum errors into IO errors */ - if (err == ECKSUM) - err = EIO; - return (err); - } - cur_pp = cur_pp->p_next; - } - - /* - * Fill in the page list array from the kluster starting - * from the desired offset `off'. - * NOTE: the page list will always be null terminated. - */ - pvn_plist_init(pp, pl, plsz, off, io_len, rw); - ASSERT(pl == NULL || (*pl)->p_offset == off); - - return (0); -} - -/* - * Return pointers to the pages for the file region [off, off + len] - * in the pl array. If plsz is greater than len, this function may - * also return page pointers from after the specified region - * (i.e. the region [off, off + plsz]). These additional pages are - * only returned if they are already in the cache, or were created as - * part of a klustered read. - * - * IN: vp - vnode of file to get data from. - * off - position in file to get data from. - * len - amount of data to retrieve. - * plsz - length of provided page list. - * seg - segment to obtain pages for. - * addr - virtual address of fault. - * rw - mode of created pages. - * cr - credentials of caller. - * ct - caller context. - * - * OUT: protp - protection mode of created pages. - * pl - list of pages created. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * vp - atime updated - */ -/* ARGSUSED */ -static int -zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, - page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, - enum seg_rw rw, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t **pl0 = pl; - int err = 0; - - /* we do our own caching, faultahead is unnecessary */ - if (pl == NULL) - return (0); - else if (len > plsz) - len = plsz; - else - len = P2ROUNDUP(len, PAGESIZE); - ASSERT(plsz >= len); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (protp) - *protp = PROT_ALL; - - /* - * Loop through the requested range [off, off + len) looking - * for pages. If we don't find a page, we will need to create - * a new page and fill it with data from the file. - */ - while (len > 0) { - if (*pl = page_lookup(vp, off, SE_SHARED)) - *(pl+1) = NULL; - else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) - goto out; - while (*pl) { - ASSERT3U((*pl)->p_offset, ==, off); - off += PAGESIZE; - addr += PAGESIZE; - if (len > 0) { - ASSERT3U(len, >=, PAGESIZE); - len -= PAGESIZE; - } - ASSERT3U(plsz, >=, PAGESIZE); - plsz -= PAGESIZE; - pl++; - } - } - - /* - * Fill out the page array with any pages already in the cache. - */ - while (plsz > 0 && - (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { - off += PAGESIZE; - plsz -= PAGESIZE; - } -out: - if (err) { - /* - * Release any pages we have previously locked. - */ - while (pl > pl0) - page_unlock(*--pl); - } else { - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - } - - *pl = NULL; - - ZFS_EXIT(zfsvfs); - return (err); -} - -/* - * Request a memory map for a section of a file. This code interacts - * with common code and the VM system as follows: - * - * common code calls mmap(), which ends up in smmap_common() - * - * this calls VOP_MAP(), which takes you into (say) zfs - * - * zfs_map() calls as_map(), passing segvn_create() as the callback - * - * segvn_create() creates the new segment and calls VOP_ADDMAP() - * - * zfs_addmap() updates z_mapcnt - */ -/*ARGSUSED*/ -static int -zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - segvn_crargs_t vn_a; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if ((prot & PROT_WRITE) && - (zp->z_phys->zp_flags & (ZFS_IMMUTABLE | ZFS_READONLY | - ZFS_APPENDONLY))) { - ZFS_EXIT(zfsvfs); - return (EPERM); - } - - if ((prot & (PROT_READ | PROT_EXEC)) && - (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED)) { - ZFS_EXIT(zfsvfs); - return (EACCES); - } - - if (vp->v_flag & VNOMAP) { - ZFS_EXIT(zfsvfs); - return (ENOSYS); - } - - if (off < 0 || len > MAXOFFSET_T - off) { - ZFS_EXIT(zfsvfs); - return (ENXIO); - } - - if (vp->v_type != VREG) { - ZFS_EXIT(zfsvfs); - return (ENODEV); - } - - /* - * If file is locked, disallow mapping. 
- */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { - ZFS_EXIT(zfsvfs); - return (EAGAIN); - } - - as_rangelock(as); - error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (error != 0) { - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); - } - - vn_a.vp = vp; - vn_a.offset = (u_offset_t)off; - vn_a.type = flags & MAP_TYPE; - vn_a.prot = prot; - vn_a.maxprot = maxprot; - vn_a.cred = cr; - vn_a.amp = NULL; - vn_a.flags = flags & ~MAP_TYPE; - vn_a.szc = 0; - vn_a.lgrp_mem_policy_flags = 0; - - error = as_map(as, *addrp, len, segvn_create, &vn_a); - - as_rangeunlock(as); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* ARGSUSED */ -static int -zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, - size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, - caller_context_t *ct) -{ - uint64_t pages = btopr(len); - - atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); - return (0); -} - -/* - * The reason we push dirty pages as part of zfs_delmap() is so that we get a - * more accurate mtime for the associated file. Since we don't have a way of - * detecting when the data was actually modified, we have to resort to - * heuristics. If an explicit msync() is done, then we mark the mtime when the - * last page is pushed. The problem occurs when the msync() call is omitted, - * which by far the most common case: - * - * open() - * mmap() - * - * munmap() - * close() - *