We are seeing a NullReferenceException in our stress program. The investigation process may be helpful to some folks.

 

Thread 22 is showing a Watson dialog.

 

This is a register corruption.

 

0:022> kp

ChildEBP RetAddr 

091bc9a0 76961220 ntdll!ZwWaitForSingleObject(void)+0x15

091bca10 76961188 kernel32!WaitForSingleObjectEx(void * hHandle = 0x00001034, unsigned long dwMilliseconds = 0x4e20, int bAlertable = 0)+0xbe

091bca24 7bb1fca4 kernel32!WaitForSingleObject(void * hHandle = 0x00001034, unsigned long dwMilliseconds = 0x4e20)+0x12

091bca54 7bb202b4 mscorwks!ClrWaitForSingleObject(void * handle = 0x00001034, unsigned long timeout = 0x4e20)+0x24

091bcf10 7bb23570 mscorwks!RunWatson(void * hWatsonSharedMemory = 0x000015f4, void * hEventAlive = 0x00001034, void * hEventDone = 0x00000d7c, void * hMutex = 0x00001378)+0x18f

091bd658 7bb23b43 mscorwks!DoFaultReportWorker(struct _EXCEPTION_POINTERS * pExceptionInfo = 0x091bd7ac, class TypeOfReportedError tore = class TypeOfReportedError, class Thread * pThread = 0x09b1d0b0, unsigned long dwThreadID = 0x300)+0xb52

091bd6a0 7bb317d3 mscorwks!DoFaultReport(struct _EXCEPTION_POINTERS * pExceptionInfo = 0x091bd7ac, class TypeOfReportedError tore = class TypeOfReportedError)+0x13d

091bd6c8 7bb361fb mscorwks!WatsonLastChance(class Thread * pThread = 0x09b1d0b0, struct _EXCEPTION_POINTERS * pExceptionInfo = 0x091bd7ac, class TypeOfReportedError tore = class TypeOfReportedError)+0x51

091bd728 7bb36387 mscorwks!InternalUnhandledExceptionFilter_Worker(struct _EXCEPTION_POINTERS * pExceptionInfo = 0x091bd7ac)+0x179

091bd734 7bb7578c mscorwks!ThreadBaseExceptionAppDomainFilter(struct _EXCEPTION_POINTERS * pExceptionInfo = 0x091bd7ac, void * pvParam = 0x091bf0f4)+0x1a

091bd76c 7baf167a mscorwks!ThreadBaseRedirectingFilter(struct _EXCEPTION_POINTERS * pExceptionInfo = 0x091bd7ac, void * _pCallState = 0x00000002)+0x7b

091bd774 7b9d2540 mscorwks!ManagedThreadBase_DispatchOuter(struct ManagedThreadCallState * pCallState = 0x091bf0f4)+0x38

091bd788 7ba6bf39 mscorwks!_EH4_CallFilterFunc(void)+0x12

091bd7b0 77e6b6d1 mscorwks!_except_handler4(struct _EXCEPTION_RECORD * ExceptionRecord = <Memory access error>, struct _EXCEPTION_REGISTRATION_RECORD * EstablisherFrame = <Memory access error>, struct _CONTEXT * ContextRecord = <Memory access error>, void * DispatcherContext = <Memory access error>)+0x8e

091bd7d4 77e6b6a3 ntdll!ExecuteHandler2(void)+0x26

091bd87c 77e4ee57 ntdll!ExecuteHandler(void)+0x24

091bd87c 06b2d7f3 ntdll!KiUserExceptionDispatcher(void)+0xf

091bea00 06b2d796 mscorlib!System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(<HRESULT 0x80004001>)+0x43

091bea38 06b2d6b6 mscorlib!System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Contains(<HRESULT 0x80004001>)+0x6

 

 

The first thing we should do is find the right exception context. As pointed out in this article, we should look for _EXCEPTION_POINTERS, and the frame mscorwks!DoFaultReport gives us that (as do many of the other frames). Let’s use it.

 

0:022> dc 0x091bd7ac

091bd7ac  091bd894 091bd8e4 77e6b6d1 091bd894  ...........w....

091bd7bc  091beed4 091bd8e4 091bd870 091bdc64  ........p...d...

091bd7cc  77e6b6e5 091beed4 091bd87c 77e6b6a3  ...w....|......w

091bd7dc  091bd894 091beed4 091bd8e4 091bd870  ............p...

091bd7ec  7ba6beb4 00000000 091bd894 091beed4  ...{............

091bd7fc  77e6b648 091bd894 091beed4 091bd8e4  H..w............

091bd80c  091bd870 7ba6beb4 00000002 091bd894  p......{........

091bd81c  091be6e8 7bb83bd0 035db834 00000004  .....;.{4.].....

 

The second DWORD (091bd8e4) is the ContextRecord pointer of the _EXCEPTION_POINTERS structure, i.e. the address of the exception context. Let’s switch to it.

 

0:022> .cxr 091bd8e4

eax=00000000 ebx=03339044 ecx=033310a8 edx=00000000 esi=00000001 edi=03312bc0

eip=06b2d7f3 esp=091be9f0 ebp=091bea00 iopl=0         nv up ei ng nz ac pe cy

cs=0023  ss=002b  ds=002b  es=002b  fs=0053  gs=002b             efl=00010297

mscorlib!System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(System.__Canon)+0x43:

06b2d7f3 8b5610          mov     edx,dword ptr [esi+10h] ds:002b:00000011=????????

 

So this is an access to invalid memory. This is very strange. LinkedList is pure managed code. I have never seen managed code access invalid memory.

 

Let’s look at the assembly code and see what happened.

 

0:022> !u 06b2d7f3

Normal JIT generated code

System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(System.__Canon)

Begin 06b2d7b0, size 83

06b2d7b0 55              push    ebp

06b2d7b1 8bec            mov     ebp,esp

06b2d7b3 57              push    edi

06b2d7b4 56              push    esi

06b2d7b5 53              push    ebx

06b2d7b6 50              push    eax

06b2d7b7 8bf9            mov     edi,ecx

06b2d7b9 8bda            mov     ebx,edx

06b2d7bb 8b7704          mov     esi,dword ptr [edi+4]

06b2d7be 8b0f            mov     ecx,dword ptr [edi]

06b2d7c0 8b4120          mov     eax,dword ptr [ecx+20h]

06b2d7c3 8b10            mov     edx,dword ptr [eax]

06b2d7c5 83c208          add     edx,8

06b2d7c8 8b02            mov     eax,dword ptr [edx]

06b2d7ca 85c0            test    eax,eax

06b2d7cc 7513            jne     mscorlib!System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(System.__Canon)+0x31 (06b2d7e1)

06b2d7ce 6a00            push    0

06b2d7d0 52              push    edx

06b2d7d1 51              push    ecx

06b2d7d2 bae900001b      mov     edx,1B0000E9h

06b2d7d7 b9f0047e02      mov     ecx,27E04F0h (MD: System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(System.__Canon))

06b2d7dc e8e271fb74      call    mscorwks!JIT_GenericHandle (7bae49c3)

06b2d7e1 8bc8            mov     ecx,eax

06b2d7e3 e810a078f9      call    mscorlib!System.Collections.Generic.EqualityComparer`1[[System.__Canon, mscorlib]].get_Default() (002b77f8)

06b2d7e8 8945f0          mov     dword ptr [ebp-10h],eax

06b2d7eb 85f6            test    esi,esi

06b2d7ed 743c            je      mscorlib!System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(System.__Canon)+0x7b (06b2d82b)

06b2d7ef 85db            test    ebx,ebx

06b2d7f1 7422            je      mscorlib!System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(System.__Canon)+0x65 (06b2d815)

>>> 06b2d7f3 8b5610          mov     edx,dword ptr [esi+10h]

06b2d7f6 53              push    ebx

06b2d7f7 8b4df0          mov     ecx,dword ptr [ebp-10h]

06b2d7fa 8b01            mov     eax,dword ptr [ecx]

06b2d7fc ff5038          call    dword ptr [eax+38h]

 

0:022> r

Last set context:

eax=00000000 ebx=03339044 ecx=033310a8 edx=00000000 esi=00000001 edi=03312bc0

eip=06b2d7f3 esp=091be9f0 ebp=091bea00 iopl=0         nv up ei ng nz ac pe cy

cs=0023  ss=002b  ds=002b  es=002b  fs=0053  gs=002b             efl=00010297

Microsoft_Live_Moe_Runtime!System.Collections.Generic.LinkedList`1[[System.__Canon, mscorlib]].Find(System.__Canon)+0x43:

06b2d7f3 8b5610          mov     edx,dword ptr [esi+10h] ds:002b:00000011=????????

 

0:022> dc @edi+4

03312bc4  136deb74 00000000 00000008 00000000  t.m.............

03312bd4  00fe553c 03312be8 00000258 00000000  <U...+1.X.......

03312be4  00020000 00fe56b0 00000258 00000004  .....V..X.......

 

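The instruction at 06b2d7f3 (marked with >>> above) is where the exception happened. If we read the assembly code, register @esi is loaded with the value at address @edi+4 by the instruction at 06b2d7bb, and is not changed afterwards in this function. Register @edi is assigned by the instruction at 06b2d7b7, and is not changed afterwards either. (There are two calls in between, but @esi and @edi are callee-saved registers, so any callee is expected to restore them before returning.) This means @esi should be the same as the value at memory @edi+4.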

 

However, @esi = 1, while the memory at @edi+4 is 136deb74. Apparently, one of them is wrong. Judging from the values, it looks like the register @esi is the one that is corrupted.

 

We still don’t know why the register is corrupted. But at least we are confident that this is not a bug in LinkedList.