zfsonfreebsd/ZoF

panic: recursive fpu_kern_enter while in PCB_FPUNOSAVE state

Opened this issue · 6 comments

With root on ZFS I occasionally hit this panic on boot in the SSE fletcher4 code.

The CPU is:

CPU: Intel(R) Xeon(R) CPU E3-1275 v3 @ 3.50GHz (3491.99-MHz K8-class CPU)
  Origin="GenuineIntel"  Id=0x306c3  Family=0x6  Model=0x3c  Stepping=3
  Features=0xbfebfbff<FPU,VME,DE,PSE,TSC,MSR,PAE,MCE,CX8,APIC,SEP,MTRR,PGE,MCA,CMOV,PAT,PSE36,CLFLUSH,DTS,ACPI,MMX,FXSR,SSE,SSE2,SS,HTT,TM,PBE>
  Features2=0x7ffafbff<SSE3,PCLMULQDQ,DTES64,MON,DS_CPL,VMX,SMX,EST,TM2,SSSE3,SDBG,FMA,CX16,xTPR,PDCM,PCID,SSE4.1,SSE4.2,x2APIC,MOVBE,POPCNT,TSCDLT,AESNI,XSAVE,OSXSAVE,AVX,F16C,RDRAND>
  AMD Features=0x2c100800<SYSCALL,NX,Page1GB,RDTSCP,LM>
  AMD Features2=0x21<LAHF,ABM>
  Structured Extended Features=0x27ab<FSGSBASE,TSCADJ,BMI1,AVX2,SMEP,BMI2,ERMS,INVPCID,NFPUSG>
  XSAVE Features=0x1<XSAVEOPT>
  VT-x: PAT,HLT,MTF,PAUSE,EPT,UG,VPID
  TSC: P-state invariant, performance statistics

Here is some stuff from the crash dump:

(kgdb) bt
#0  __curthread () at /usr/src/sys/amd64/include/pcpu_aux.h:55
#1  doadump (textdump=1) at /usr/src/sys/kern/kern_shutdown.c:394
#2  0xffffffff80bd6900 in kern_reboot (howto=260) at /usr/src/sys/kern/kern_shutdown.c:481
#3  0xffffffff80bd6d4a in vpanic (fmt=<optimized out>, ap=<optimized out>) at /usr/src/sys/kern/kern_shutdown.c:913
#4  0xffffffff80bd6ab3 in panic (fmt=<unavailable>) at /usr/src/sys/kern/kern_shutdown.c:839
#5  0xffffffff80ff39f1 in fpu_kern_enter (td=<optimized out>, ctx=0x0, flags=<unavailable>) at /usr/src/sys/amd64/amd64/fpu.c:1090
#6  0xffffffff8216e194 in fletcher_4_sse2_native (ctx=0xfffffe00e742c700, buf=0xfffff800258a7600, size=<optimized out>)
    at /usr/src/sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c:105
#7  0xffffffff8216d46e in abd_fletcher_4_iter (data=0xfffff800258a7600, size=512, private=0xfffffe00e742c6e0)
    at /usr/src/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c:865
#8  0xffffffff82172829 in abd_iterate_func (abd=0xfffff8002eb89c00, off=<optimized out>, size=512, func=0xffffffff8216d430 <abd_fletcher_4_iter>,
    private=0xfffffe00e742c6e0) at /usr/src/sys/contrib/openzfs/module/zfs/abd.c:805
#9  0xffffffff822ce5ff in abd_fletcher_4_impl (abd=0xfffff8002eb89c00, size=512, acdp=<optimized out>) at /usr/src/sys/contrib/openzfs/module/zfs/zio_checksum.c:126
#10 abd_fletcher_4_native (abd=<unavailable>, size=<unavailable>, ctx_template=<optimized out>, zcp=<optimized out>)
    at /usr/src/sys/contrib/openzfs/module/zfs/zio_checksum.c:143
#11 0xffffffff822cee3b in zio_checksum_error_impl (spa=<optimized out>, bp=0xfffff8002eb9c578, checksum=ZIO_CHECKSUM_FLETCHER_4, abd=<optimized out>, size=<unavailable>,
    offset=<optimized out>, info=0xfffffe00e742ca48) at /usr/src/sys/contrib/openzfs/module/zfs/zio_checksum.c:488
#12 0xffffffff822cf29f in zio_checksum_error (zio=0xfffff8002eb9c4d0, info=0xfffffe00e742ca48) at /usr/src/sys/contrib/openzfs/module/zfs/zio_checksum.c:540
#13 0xffffffff822cbcf3 in zio_checksum_verify (zio=0xfffff8002eb9c4d0) at /usr/src/sys/contrib/openzfs/module/zfs/zio.c:4216
#14 0xffffffff822c6c39 in __zio_execute (zio=0xfffff8002eb9c4d0) at /usr/src/sys/contrib/openzfs/module/zfs/zio.c:2169
#15 zio_execute (zio=<optimized out>) at /usr/src/sys/contrib/openzfs/module/zfs/zio.c:2082
#16 0xffffffff80c36c0a in taskqueue_run_locked (queue=0xfffff800253db100) at /usr/src/sys/kern/subr_taskqueue.c:476
#17 0xffffffff80c37c94 in taskqueue_thread_loop (arg=<optimized out>) at /usr/src/sys/kern/subr_taskqueue.c:793
#18 0xffffffff80b92830 in fork_exit (callout=0xffffffff80c37c00 <taskqueue_thread_loop>, arg=0xfffff8002e90e0e0, frame=0xfffffe00e742cc00)
    at /usr/src/sys/kern/kern_fork.c:1052
#19 <signal handler called>
(kgdb) frame 14
#14 0xffffffff822c6c39 in __zio_execute (zio=0xfffff8002eb9c4d0) at /usr/src/sys/contrib/openzfs/module/zfs/zio.c:2169
2169                    zio = zio_pipeline[highbit64(stage) - 1](zio);
(kgdb) p *zio
$1 = {io_bookmark = {zb_objset = 0, zb_object = 0, zb_level = -1, zb_blkid = 0}, io_prop = {zp_checksum = ZIO_CHECKSUM_INHERIT, zp_compress = ZIO_COMPRESS_INHERIT, zp_complevel = 0 '\000', zp_type = DMU_OT_NONE, zp_level = 0 '\000', zp_copies = 0 '\000', zp_dedup = 0, zp_dedup_verify = 0, zp_nopwrite = 0, zp_encrypt = 0, zp_byteorder = 0,
    zp_salt = "\000\000\000\000\000\000\000", zp_iv = '\000' <repeats 11 times>, zp_mac = '\000' <repeats 15 times>, zp_zpl_smallblk = 0}, io_type = ZIO_TYPE_READ, io_child_type = ZIO_CHILD_VDEV, io_trim_flags = (unknown: 0), io_cmd = 0, io_priority = ZIO_PRIORITY_SYNC_READ, io_reexecute = 0 '\000', io_state = "\001", io_txg = 869176,
  io_spa = 0xfffffe00e7345000, io_bp = 0xfffff8002eb9c578, io_bp_override = 0x0, io_bp_copy = {blk_dva = {{dva_word = {1, 629146649}}, {dva_word = {1, 637534322}}, {dva_word = {1, 671089010}}}, blk_prop = 9226476022604496903, blk_pad = {0, 0}, blk_phys_birth = 0, blk_birth = 869176, blk_fill = 462, blk_cksum = {zc_word = {46157615889,
        4602830873836, 235214445069463, 8215871156353806}}}, io_parent_list = {list_size = 48, list_offset = 16, list_head = {list_next = 0xfffff80025f09af0, list_prev = 0xfffff80025f09af0}}, io_child_list = {list_size = 48, list_offset = 32, list_head = {list_next = 0xfffff8002eb9c628, list_prev = 0xfffff8002eb9c628}},
  io_logical = 0xfffff80025eec000, io_transform_stack = 0x0, io_ready = 0x0, io_children_ready = 0x0, io_physdone = 0x0, io_done = 0xffffffff8222b850 <vdev_mirror_child_done>, io_private = 0xfffff8002eb89898, io_prev_space_delta = 0, io_bp_orig = {blk_dva = {{dva_word = {1, 629146649}}, {dva_word = {1, 637534322}}, {dva_word = {1,
          671089010}}}, blk_prop = 9226476022604496903, blk_pad = {0, 0}, blk_phys_birth = 0, blk_birth = 869176, blk_fill = 462, blk_cksum = {zc_word = {46157615889, 4602830873836, 235214445069463, 8215871156353806}}}, io_lsize = 512, io_abd = 0xfffff8002eb89c00, io_orig_abd = 0xfffff8002eb89c00, io_size = 512, io_orig_size = 512,
  io_vd = 0xfffffe00e7664000, io_vsd = 0x0, io_vsd_ops = 0x0, io_metaslab_class = 0x0, io_offset = 322127278592, io_timestamp = 8409558794, io_queued_timestamp = 8409558362, io_target_timestamp = 0, io_delta = 75072, io_delay = 74702, io_queue_node = {avl_child = {0x0, 0x0}, avl_pcb = 1}, io_offset_node = {avl_child = {0x0, 0x0},
    avl_pcb = 1}, io_alloc_node = {avl_child = {0x0, 0x0}, avl_pcb = 0}, io_alloc_list = {zal_list = {list_size = 0, list_offset = 0, list_head = {list_next = 0x0, list_prev = 0x0}}, zal_size = 0}, io_flags = 1575040, io_stage = ZIO_STAGE_CHECKSUM_VERIFY,
  io_pipeline = (ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_DONE | ZIO_STAGE_VDEV_IO_ASSESS | ZIO_STAGE_CHECKSUM_VERIFY | ZIO_STAGE_DONE), io_orig_flags = 1048704, io_orig_stage = ZIO_STAGE_READY, io_orig_pipeline = (ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_DONE | ZIO_STAGE_VDEV_IO_ASSESS | ZIO_STAGE_CHECKSUM_VERIFY | ZIO_STAGE_DONE),
  io_pipeline_trace = (ZIO_STAGE_OPEN | ZIO_STAGE_VDEV_IO_START | ZIO_STAGE_VDEV_IO_DONE | ZIO_STAGE_VDEV_IO_ASSESS | ZIO_STAGE_CHECKSUM_VERIFY), io_error = 0, io_child_error = {0, 0, 0, 0}, io_children = {{0, 0}, {0, 0}, {0, 0}, {0, 0}}, io_child_count = 0, io_phys_children = 0, io_parent_count = 1, io_stall = 0x0, io_gang_leader = 0x0,
  io_gang_tree = 0x0, io_executor = 0xfffffe00e73e2500, io_waiter = 0x0, io_bio = 0x0, io_lock = {lock_object = {lo_name = 0xffffffff8237bedf <.L.str.40+1> "zio->io_lock", lo_flags = 577830912, lo_data = 0, lo_witness = 0x0}, sx_lock = 1}, io_cv = {cv_description = 0xffffffff82377508 <.L.str.41+1> "zio->io_cv", cv_waiters = 0},
  io_allocator = 0, io_cksum_report = 0x0, io_ena = 0, io_tqent = {tqent_task = {ta_link = {stqe_next = 0x0}, ta_pending = 0, ta_priority = 0 '\000', ta_flags = 0 '\000', ta_func = 0xffffffff82139f80 <taskq_run_ent>, ta_context = 0xfffff8002eb9c8e0}, tqent_timeout_task = {q = 0x0, t = {ta_link = {stqe_next = 0x0}, ta_pending = 0,
        ta_priority = 0 '\000', ta_flags = 0 '\000', ta_func = 0x0, ta_context = 0x0}, c = {c_links = {le = {le_next = 0x0, le_prev = 0x0}, sle = {sle_next = 0x0}, tqe = {tqe_next = 0x0, tqe_prev = 0x0}}, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0x0, c_lock = 0x0, c_flags = 0, c_iflags = 0, c_cpu = 0}, f = 0},
    tqent_func = 0xffffffff822c6bd0 <zio_execute>, tqent_arg = 0xfffff8002eb9c4d0, tqent_id = 0, tqent_hash = {cle_next = 0x0, cle_prev = 0x0}, tqent_type = 0 '\000', tqent_registered = 0 '\000', tqent_cancelled = 0 '\000', tqent_rc = 0}}
(kgdb)

@freqlabs please reopen if you hit this again (should only be possible in a syscall now)

Still panics at boot with this on projects/fpu_kern_thread

Ok the new panic is similar but not exactly the same:
Screen Shot 2020-09-11 at 4 44 58 PM

panic: fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)
instead of
panic: recursive fpu_kern_enter while in PCB_FPUNOSAVE state

Sometimes the panic is in the SSE2 implementation, sometimes it's in AVX2.

Latest panic on projects/fpu_kern_thread (89147cf):

<118>Mounting local filesystems:.
<118>no pools available to import
panic: recursive fpu_kern_enter while in PCB_FPUNOSAVE state
cpuid = 4
time = 1599940316
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xffffffff827998a0
vpanic() at vpanic+0x182/frame 0xffffffff827998f0
panic() at panic+0x43/frame 0xffffffff82799950
fpu_kern_enter() at fpu_kern_enter+0x231/frame 0xffffffff82799980
fletcher_4_avx2_native() at fletcher_4_avx2_native+0xaa/frame 0xffffffff827999b0
fletcher_4_native() at fletcher_4_native+0xeb/frame 0xffffffff82799ba0
fletcher_4_benchmark_impl() at fletcher_4_benchmark_impl+0xe0/frame 0xffffffff82799c40
fletcher_4_init() at fletcher_4_init+0x187/frame 0xffffffff82799c60
mi_startup() at mi_startup+0xec/frame 0xffffffff82799cb0
btext() at btext+0x2c
KDB: enter: panic
(kgdb) bt
#0  __curthread () at /usr/src/sys/amd64/include/pcpu_aux.h:55
#1  doadump (textdump=1) at /usr/src/sys/kern/kern_shutdown.c:394
#2  0xffffffff80bd6db0 in kern_reboot (howto=260) at /usr/src/sys/kern/kern_shutdown.c:481
#3  0xffffffff80bd71fa in vpanic (fmt=<optimized out>, ap=<optimized out>) at /usr/src/sys/kern/kern_shutdown.c:913
#4  0xffffffff80bd6f63 in panic (fmt=<unavailable>) at /usr/src/sys/kern/kern_shutdown.c:839
#5  0xffffffff80ff6221 in fpu_kern_enter (td=<optimized out>, ctx=0x0, flags=<unavailable>) at /usr/src/sys/amd64/amd64/fpu.c:1089
#6  0xffffffff8216f2ea in fletcher_4_avx2_native (ctx=0xffffffff82799a00, buf=<optimized out>, size=<optimized out>) at zcommon/zfs_fletcher_intel.c:119
#7  0xffffffff8216dd7b in fletcher_4_scalar_native (ctx=<unavailable>, buf=<optimized out>, size=<optimized out>) at zcommon/zfs_fletcher.c:329
#8  fletcher_4_native (buf=<optimized out>, size=<optimized out>, ctx_template=<optimized out>, zcp=<unavailable>) at zcommon/zfs_fletcher.c:476
#9  0xffffffff8216e9a0 in gethrtime () at /home/ryan/ZoF/include/os/freebsd/spl/sys/time.h:73
#10 fletcher_4_benchmark_impl (native=1, data=0xfffffe00dc2d9000 "", data_size=131072) at zcommon/zfs_fletcher.c:691
#11 0xffffffff8216e4b7 in fletcher_4_benchmark () at zcommon/zfs_fletcher.c:752
#12 fletcher_4_init () at zcommon/zfs_fletcher.c:776
#13 0xffffffff80b682ec in mi_startup () at /usr/src/sys/kern/init_main.c:312
#14 0xffffffff8036f02c in btext () at /usr/src/sys/amd64/amd64/locore.S:76
(kgdb)

Fixed by e071625