While I was preparing this year's performance talk for MEDC, I was curious about the quality of JIT-compiled code compared to native code.  I wanted to dive in a bit deeper to understand what the differences are and where they come from.

I chose a set of what I consider to be fairly common programming tasks: math, moving memory around, file IO, and iterating.  I wrote code in C#, VB, and C and measured the execution time on a Dell Axim x51v.  These tests were run against v2 (code generation hasn't changed for v3.5, so this analysis should hold for newer versions).

Let's look at the code I used for the tests starting with math.  I quite randomly chose a series of mathematical operations and came up with the following code that implements "Ryan's Formula":

C#
/// <summary>
/// Math benchmark: 10 million iterations of a fixed sequence of integer
/// add/multiply/subtract/shift operations ("Ryan's Formula").
/// </summary>
/// <param name="duration">Receives the elapsed time in milliseconds, measured
/// with Environment.TickCount.</param>
/// <returns>The accumulated result; returning it keeps the loop's value live
/// so the work cannot be discarded as dead code.</returns>
private int DoManagedMathTest(out int duration)
{
    int start = Environment.TickCount;

    int result = 0;
    for (int i = 0; i < 10000000; i++)
    {
        result = result + i;
        result = result * i;
        result = result - i;

        // Intermediate values overflow Int32; C# arithmetic is unchecked by
        // default so this wraps silently (the VB version needs overflow
        // checks disabled for the same reason -- see the article text).
        result = result - i * i;
        result = result << 5;
        result = result + i * i;
    }

    duration = Environment.TickCount - start;

    return result;
}

VB
''' <summary>
''' VB version of the math benchmark: same 10-million-iteration
''' add/multiply/subtract/shift sequence as the C# sample.
''' VB has no "out" parameters, so duration is passed ByRef instead.
''' </summary>
''' <param name="duration">Receives the elapsed time in milliseconds
''' (Environment.TickCount).</param>
''' <returns>The accumulated result.</returns>
''' <remarks>Requires integer overflow checks to be disabled in the project;
''' with checks on, this arithmetic throws an OverflowException (see the
''' article text).</remarks>
Public Shared Function DoVBMathTest(ByRef duration As Integer) As Integer
    Dim start As Integer = Environment.TickCount

    Dim result As Integer = 0
    Dim i As Integer

    For i = 0 To (10000000 - 1)
        result = result + i
        result = result * i
        result = result - i

        result = result - i * i
        result = result << 5
        result = result + i * i
    Next i

    duration = Environment.TickCount - start

    Return result
End Function

C
// Native version of the math benchmark, exported so the managed test app can
// P/Invoke it.  The loop body is identical to the C# and VB versions.
// pDuration - receives the elapsed time in milliseconds (GetTickCount).
// Returns the accumulated result so the final value stays live; the optimizer
// can still simplify the formula itself (see the article's analysis).
// NOTE(review): signed int overflow is technically undefined behavior in C;
// in practice this compiler wraps, matching the unchecked managed arithmetic.
__declspec(dllexport) int DoNativeMathTest(int * pDuration)
{
    int start = GetTickCount();

    int result = 0;

    for (int i = 0; i < 10000000; i++)
    {
        result = result + i;
        result = result * i;
        result = result - i;

        result = result - i * i;
        result = result << 5;
        result = result + i * i;
    }

    *pDuration = GetTickCount() - start;

    return result;
}

Next up, file IO.  The code below writes a sequence of bytes to a file and then reads them back out:

C#
/// <summary>
/// File IO benchmark: writes 1000 x 512-byte blocks to testfile.txt, seeks
/// back to the start, and reads the whole file back.  Only the write/read
/// phase is timed; buffer setup, Open, and Close are excluded.
/// </summary>
/// <param name="duration">Receives the elapsed time in milliseconds
/// (Environment.TickCount).</param>
/// <returns>Total number of bytes read back (512,000 if all writes landed).</returns>
// NOTE(review): fs is not wrapped in a using/try-finally, so an exception
// mid-test leaks the file handle.  Tolerable in a throwaway benchmark, not
// in production code.
private int DoManagedIOTest(out int duration)
{
    // FileMode.Create creates or truncates the file; the read loop below
    // relies on this overload also granting read access -- TODO confirm
    // against the File.Open documentation for the target framework.
    FileStream fs = File.Open("testfile.txt", FileMode.Create);

    byte[] buffer = new byte[512];

    // Fill the buffer with a recognizable pattern (values wrap at 256).
    for (int i = 0; i < buffer.Length; i++)
    {
        buffer[i] = (byte)i;
    }

    int start = Environment.TickCount;

    for (int i = 0; i < 1000; i++)
    {
        fs.Write(buffer, 0, buffer.Length);
    }

    fs.Seek(0, SeekOrigin.Begin);

    int totalBytes = 0;
    int readBytes = 0;

    // Read until EOF, counting bytes actually returned.
    while ((readBytes = fs.Read(buffer, 0, buffer.Length)) > 0)
    {
        totalBytes += readBytes;
    }

    duration = Environment.TickCount - start;

    fs.Close();

    return totalBytes;
}

VB
''' <summary>
''' VB version of the file IO benchmark: writes 1000 x 512-byte blocks,
''' seeks to the start, and reads them back.  Only the IO phase is timed.
''' VB lacks C#'s assign-inside-condition idiom, so the read loop primes
''' readBytes before the While and refreshes it at the bottom of the loop.
''' </summary>
''' <param name="duration">Receives the elapsed time in milliseconds
''' (Environment.TickCount).</param>
''' <returns>Total number of bytes read back.</returns>
Public Shared Function DoVBIOTest(ByRef duration As Integer) As Integer
    Dim fs As FileStream = File.Open("testfile.txt", FileMode.Create)

    ' VB array bounds are inclusive, hence the (512 - 1) upper bound.
    Dim buf() As Byte = New Byte(512 - 1) {}

    Dim i As Integer

    For i = 0 To (buf.Length - 1)
        buf(i) = CByte(i)
    Next i

    Dim start As Integer = Environment.TickCount

    For i = 0 To (1000 - 1)
        fs.Write(buf, 0, buf.Length)
    Next i

    fs.Seek(0, SeekOrigin.Begin)

    Dim totalBytes As Integer = 0
    Dim readBytes As Integer = 0

    readBytes = fs.Read(buf, 0, buf.Length)
    While (readBytes > 0)
        totalBytes += readBytes
        readBytes = fs.Read(buf, 0, buf.Length)
    End While

    duration = Environment.TickCount - start

    fs.Close()
    Return totalBytes
End Function

C
// Native version of the IO benchmark using raw Win32 file APIs: writes
// 1000 x 512-byte blocks, seeks to the start, and reads them back.
// pDuration - receives the elapsed time in milliseconds (GetTickCount).
// Returns the total number of bytes read back.
// NOTE(review): the CreateFile result is never checked against
// INVALID_HANDLE_VALUE, and WriteFile/ReadFile return codes from the write
// phase are ignored -- acceptable for a benchmark, unsafe in real code.
__declspec(dllexport) int DoNativeIOTest(int * pDuration)
{
    HANDLE hFile = CreateFile(
        L"testfile.txt", 
        GENERIC_READ | GENERIC_WRITE,
        0,
        NULL,
        CREATE_ALWAYS,
        0,
        NULL);

    unsigned char buffer[512];

    // Same fill pattern as the managed versions (values wrap at 256).
    for (int i = 0; i < sizeof(buffer); i++)
    {
        buffer[i] = i;
    }

    int start = GetTickCount();

    // ct receives the byte count transferred by each WriteFile/ReadFile call.
    int ct;

    for (int i = 0; i < 1000; i++)
    {
        WriteFile(hFile, buffer, sizeof(buffer), (LPDWORD)&ct, NULL);
    }

    SetFilePointer(hFile, 0, 0, FILE_BEGIN);

    int totalBytes = 0;
    int readBytes = 0;

    // ReadFile succeeds with ct == 0 at end-of-file, so EOF is detected by
    // the zero byte count rather than by the return value alone.
    while (ReadFile(hFile, buffer, sizeof(buffer), (LPDWORD)&ct, NULL) != 0)
    {
        if(ct == 0)
            break;

        totalBytes += ct;
    }

    *pDuration = GetTickCount() - start;

    CloseHandle(hFile);

    return totalBytes;
}

Moving memory:

C#
/// <summary>
/// Memory benchmark: copies a 10,000-byte buffer back and forth 100,000
/// times (so 200,000 copies total) using Array.Copy.  Only the copy loop
/// is timed; allocation and buffer setup are excluded.
/// </summary>
/// <param name="duration">Receives the elapsed time in milliseconds
/// (Environment.TickCount).</param>
/// <returns>The loop counter (100000); result doubles as the counter so a
/// live value is returned.</returns>
private int DoManagedMemoryTest(out int duration)
{
    byte[] buf1 = new byte[10000];
    byte[] buf2 = new byte[10000];

    for (int i = 0; i < buf1.Length; i++)
        buf1[i] = (byte)i;

    int start = Environment.TickCount;

    int result;

    for (result = 0; result < 100000; result++)
    {
        Array.Copy(buf1, 0, buf2, 0, buf1.Length);
        Array.Copy(buf2, 0, buf1, 0, buf2.Length);
    }

    duration = Environment.TickCount - start;

    return result;
}

VB
''' <summary>
''' VB version of the memory benchmark: 100,000 round-trip copies of a
''' 10,000-byte buffer via Array.Copy.  Only the copy loop is timed.
''' </summary>
''' <param name="duration">Receives the elapsed time in milliseconds
''' (Environment.TickCount).</param>
''' <returns>The loop counter (100000).</returns>
Public Shared Function DoVBMemoryTest(ByRef duration As Integer) As Integer
    ' VB array bounds are inclusive, hence the (10000 - 1) upper bounds.
    Dim buf1() As Byte = New Byte(10000 - 1) {}
    Dim buf2() As Byte = New Byte(10000 - 1) {}

    Dim i As Integer

    For i = 0 To (buf1.Length - 1)
        buf1(i) = CByte(i)
    Next i

    Dim start As Integer = Environment.TickCount

    Dim result As Integer

    For result = 0 To (100000 - 1)
        Array.Copy(buf1, 0, buf2, 0, buf1.Length)
        Array.Copy(buf2, 0, buf1, 0, buf2.Length)
    Next result

    duration = Environment.TickCount - start

    Return result
End Function

C
// Native version of the memory benchmark: 100,000 round-trip copies of a
// 10,000-byte buffer using memmove (the closest native analogue of the
// managed Array.Copy calls).
// pDuration - receives the elapsed time in milliseconds (GetTickCount).
// Returns the loop counter (100000).
// NOTE(review): ~20KB of stack for the two buffers; presumably fine for the
// target's default stack size, but worth confirming on other platforms.
__declspec(dllexport) int DoNativeMemoryTest(int * pDuration)
{
    unsigned char buf1[10000];
    unsigned char buf2[10000];

    for (int i = 0; i < sizeof(buf1); i++)
        buf1[i] = (unsigned char)i;

    int start = GetTickCount();

    int result;

    for (result = 0; result < 100000; result++)
    {
        memmove(buf2, buf1, sizeof(buf1));
        memmove(buf1, buf2, sizeof(buf2));
    }

    *pDuration = GetTickCount() - start;

    return result;
}

And finally, loops:

C#
/// <summary>
/// Iteration benchmark: a 100-million-trip loop whose body only increments
/// a second counter, isolating pure loop overhead.
/// </summary>
/// <param name="duration">Receives the elapsed time in milliseconds
/// (Environment.TickCount).</param>
/// <returns>result + r2 (200,000,000); summing both counters keeps them
/// live, though an optimizing native compiler can still pre-compute the
/// whole loop, as the article's analysis shows.</returns>
private int DoManagedLoopTest(out int duration)
{
    int start = Environment.TickCount;

    int result;
    int r2 = 0;

    for (result = 0; result < 100000000; result++)
    {
        r2++;
    }

    duration = Environment.TickCount - start;

    return result + r2;
}

VB
''' <summary>
''' VB version of the iteration benchmark: a 100-million-trip loop whose
''' body only increments a second counter.
''' </summary>
''' <param name="duration">Receives the elapsed time in milliseconds
''' (Environment.TickCount).</param>
''' <returns>result + r2 (200,000,000); summing both counters keeps them live.</returns>
Public Shared Function DoVBLoopTest(ByRef duration As Integer) As Integer
    Dim start As Integer = Environment.TickCount

    Dim result As Integer
    Dim r2 As Integer = 0

    For result = 0 To (100000000 - 1)
        r2 = r2 + 1
    Next result

    duration = Environment.TickCount - start

    Return result + r2
End Function

C
// Native version of the iteration benchmark: a 100-million-trip loop whose
// body only increments a second counter.
// pDuration - receives the elapsed time in milliseconds (GetTickCount).
// Returns result + r2 (200,000,000).  As the article's disassembly shows,
// the optimizing compiler pre-computes this loop entirely, which is why the
// optimized native time is 0.
__declspec(dllexport) int DoNativeLoopTest(int * pDuration)
{
    int start = GetTickCount();

    int result;
    int r2 = 0;

    for (result = 0; result < 100000000; result++)
    {
        r2++;
    }

    *pDuration = GetTickCount() - start;

    return result + r2;
}

Ok, that's a lot of code...let's look at some results!  One important note: I disabled integer overflow checks in VB.  I did this at first because the math sample above was throwing an overflow exception.  As it turns out, these checks have a significant impact on performance.  I'd encourage you to measure VB performance with and without the overflow checks enabled to see what kind of impact they may have for you.

 

C#

VB.NET

Native

Native (Optimized)

Math

442

444

941

99

IO

2338

2336

1827

1839

Memory

2769

2794

2459

2437

Loops

2020

2020

2860

0

Analysis

One of the first things we notice is that there is effectively no performance difference between C# and VB in the examples here (keeping in mind that the integer overflow checks are disabled).  This makes sense since the JIT works on IL that is generated by the C# and VB compilers.  I'd guess that the compilers would tend to generate identical--or at least very similar--IL for an algorithm whether it was implemented in C# or VB.  I'm not a compiler guy though and I'd like to hear about any obvious exceptions to this rule.

So let's look at some of the numbers in a bit more detail.  The math and loop samples involve the most interesting analysis, so I'll save those for last!

Memory and IO

In both of these cases, we notice a marginal slowdown using managed code vs. native code (roughly 10% for the memory tests and 20% for the IO tests).  Both of these slowdowns can be explained by the fact that the managed runtime is simply doing more work for every call.  The managed method calls ultimately use the same OS APIs that I used in the native code, but the runtime is doing a bit of extra work (boundary checking for example) and has to do a managed/native transition to make the low-level call.  The quality of the JIT-compiled code is not a factor in the performance differences.

Iteration 

The iteration numbers start to give us some insight into the behavior of JIT-compiled code compared to native code.  Managed code actually outperforms non-optimized native code, but the optimized native code takes no time to execute--what's going on here?  We have to dive into the assembly code that is generated by the native compilers and JIT to see what's causing such a wide variation in the results.

Managed vs. Non-Optimized Native

The assembly code snippets below are from the native compiler and the JIT respectively.  The disassembly of the native code has the added benefit of including the original C code in-line.

Native:

for (result = 0; result < 100000000; result++)
011E1498      mov    r3, #0
011E149C      str      r3, result
011E14A0      b        |DoNativeLoopTest + 0x40 ( 11e14b0h )|
011E14A4      ldr      r3, result
011E14A8      add     r3, r3, #1
011E14AC      str      r3, result
011E14B0      ldr      r2, result
011E14B4      ldr      r3, [pc, #0x4C]
011E14B8      cmp    r2, r3
011E14BC      bge     |DoNativeLoopTest + 0x60 ( 11e14d0h )|
{
r2++;
011E14C0      ldr      r3, r2
011E14C4      add     r3, r3, #1
011E14C8      str      r3, r2
}
011E14CC      b        |DoNativeLoopTest + 0x34 ( 11e14a4h )|

JIT:

000B8800      ldr      lr, [r11, #0x28]
000B8804      ands   lr, lr, #0x17
000B8808      blne    000A00BC
000B880C      add     r6, r6, #1
000B8810      add     r5, r5, #1
000B8814      ldr      r1, [pc, #0x4C]
000B8818      cmp    r5, r1
000B881C      blt      000B8800

The actual loop in the native code runs from 011E14A4 to 011E14CC.  The non-optimized native code stores the values of result and r2 in memory, so it is incurring a pretty hefty performance expense reading and writing memory each time through the loop which explains most of the disparity.

Managed vs. Optimized Native

Let's turn to the optimized native code, which had the rather startling result of taking no time to run.  This example highlights a key source of performance differences (unrelated to iteration) that will come up again when we look at the performance of the math samples.

Optimized native:

__declspec(dllexport) int DoNativeLoopTest(int * pDuration)
{
011E1234      stmdb  sp!, {r4, r5, lr}
011E1238      mov    r5, r0 

    int start = GetTickCount();

011E123C      bl       011E125C
011E1240      mov    r4, r0 

    int result;
    int r2 = 0;

    for (result = 0; result < 100000000; result++)
    {
        r2++;
    }

    *pDuration = GetTickCount() - start;
011E1244      bl       011E125C
011E1248      sub     r3, r0, r4 

    return result + r2;
011E124C      ldr      r0, [pc, #4]
011E1250      str      r3, pDuration
}

Taking a quick look at this code, you'll notice that there are no instructions to actually perform the iteration.  The native compiler was able to pre-compute the result of the loop and generate code to calculate the return value without iterating.  So this doesn't tell us much about the performance of iteration, but it does demonstrate that the native compiler is able to find optimizations that the managed compiler misses.  We'll take another look at loop performance in the math section when the native optimizer isn't able to find a way to eliminate the loop altogether!

Math

Managed code dramatically outperforms non-optimized native code, but appears to be crushed by optimized native code.  Let's take a look at the assembly code again to find the answers.

Managed vs. Non-Optimized Native

Native:

    for (int i = 0; i < 10000000; i++)
011E1078      mov    r3, #0
011E107C      str      r3, [sp, #8]
011E1080      b        |DoNativeMathTest + 0x40 ( 11e1090h )|
011E1084      ldr      r3, [sp, #8]
011E1088      add     r3, r3, #1
011E108C      str      r3, [sp, #8]
011E1090      ldr      r2, [sp, #8]
011E1094      ldr      r3, [pc, #0xA4]
011E1098      cmp    r2, r3
011E109C      bge     |DoNativeMathTest + 0xc0 ( 11e1110h )| 
    {
        result = result + i;
011E10A0      ldr      r2, result
011E10A4      ldr      r3, [sp, #8]
011E10A8      add     r3, r2, r3
011E10AC      str      r3, result 

        result = result * i;
011E10B0      ldr      r2, result
011E10B4      ldr      r3, [sp, #8]
011E10B8      mul     r3, r2, r3
011E10BC      str      r3, result 

        result = result - i;
011E10C0      ldr      r2, result
011E10C4      ldr      r3, [sp, #8]
011E10C8      sub     r3, r2, r3
011E10CC      str      r3, result 

        result = result - i * i;
011E10D0      ldr      r1, [sp, #8]
011E10D4      ldr      r3, [sp, #8]
011E10D8      mul     r2, r1, r3
011E10DC      ldr      r3, result
011E10E0      sub     r3, r3, r2
011E10E4      str      r3, result 

        result = result << 5;
011E10E8      ldr      r3, result
011E10EC      mov    r3, r3, lsl #5
011E10F0      str      r3, result 

        result = result + i * i;
011E10F4      ldr      r1, [sp, #8]
011E10F8      ldr      r3, [sp, #8]
011E10FC      mul     r2, r1, r3
011E1100      ldr      r3, result
011E1104      add     r3, r3, r2
011E1108      str      r3, result 
    }
011E110C      b        |DoNativeMathTest + 0x34 ( 11e1084h )|

JIT:

000B2430      ldr      lr, [r11, #0x28]
000B2434      ands   lr, lr, #0x17
000B2438      blne    000A00BC
000B243C      add     r5, r5, r6
000B2440      mul     r5, r6, r5
000B2444      sub     r5, r5, r6
000B2448      mul     r1, r6, r6
000B244C      sub     r5, r5, r1
000B2450      mov    r5, r5, lsl #5
000B2454      mul     r1, r6, r6
000B2458      add     r5, r5, r1
000B245C      add     r6, r6, #1
000B2460      ldr      r1, [pc, #0x4C]
000B2464      cmp    r6, r1
000B2468      blt      000B2430

We can see immediately that the C code suffers from using the stack to store the values of result and i.  Every operation requires a series of loads and stores to update the values.  The JIT-compiled code uses registers for result and i which makes things a lot faster.

Managed vs. Optimized Native

The JIT seems to be significantly outclassed by the native compiler when the native compiler is allowed to optimize.  Let's find out why by looking at the assembly code generated by the native compiler:

    for (int i = 0; i < 10000000; i++)
011E101C      ldr      r1, [pc, #0x34]
011E1020      mov    r6, r0
011E1024      mov    r4, #0
011E1028      mov    r2, #0 

    {
        result = result + i;
        result = result * i;
        result = result - i;

        result = result - i * i;
        result = result << 5;
        result = result + i * i;
011E102C      add     r3, r2, r4, lsl #5
011E1030      sub     r3, r3, #0x20
011E1034      mul     r4, r3, r2
011E1038      add     r2, r2, #1
011E103C      cmp    r2, r1
011E1040      blt      |DoNativeMathTest + 0x1c ( 11e102ch )| 
    }

First of all, the native compiler, like the JIT, is taking advantage of registers instead of operating on values in memory.  But it also seems to have generated a lot less code--how is this possible?  Well, it turns out "Ryan's Formula" isn't terribly well conceived.  Using a pencil and a piece of engineering paper you'll find that the equation can be simplified to the following in C#:

C#
/// <summary>
/// Hand-optimized version of the math benchmark.  The six-step "Ryan's
/// Formula" algebraically simplifies to the three statements below -- the
/// same simplification the optimizing native compiler found automatically.
/// </summary>
/// <param name="duration">Receives the elapsed time in milliseconds
/// (Environment.TickCount).</param>
/// <returns>The accumulated result (identical to DoManagedMathTest's).</returns>
private int DoOptimizedManagedMathTest(out int duration)
{
    int start = Environment.TickCount;

    int result = 0;

    for (int i = 0; i < 10000000; i++)
    {
        // Equivalent to: ((result + i) * i - i - i*i) << 5 ... collapsed.
        int tmp = i + (result << 5);
        tmp = tmp - 0x20;
        result = tmp * i;
    }

    duration = Environment.TickCount - start;

    return result;
}

It is interesting to note that the native compiler identified this optimization whereas the managed compiler simply converted each step of the formula into an IL instruction, as we can see from the IL generated for the original method:

IL_0000:        call     int32 [mscorlib]System.Environment::get_TickCount()
IL_0005:        stloc.0
IL_0006:        ldc.i4.0
IL_0007:        stloc.1
IL_0008:        ldc.i4.0
IL_0009:        stloc.2
IL_000a:        br.s     IL_002c
IL_000c:        ldloc.1
IL_000d:        ldloc.2
IL_000e:        add
IL_000f:        stloc.1
IL_0010:        ldloc.1
IL_0011:        ldloc.2
IL_0012:        mul
IL_0013:        stloc.1
IL_0014:        ldloc.1
IL_0015:        ldloc.2
IL_0016:        sub
IL_0017:        stloc.1
IL_0018:        ldloc.1
IL_0019:        ldloc.2
IL_001a:        ldloc.2
IL_001b:        mul
IL_001c:        sub
IL_001d:        stloc.1
IL_001e:        ldloc.1
IL_001f:        ldc.i4.5
IL_0020:        shl
IL_0021:        stloc.1
IL_0022:        ldloc.1
IL_0023:        ldloc.2
IL_0024:        ldloc.2
IL_0025:        mul
IL_0026:        add
IL_0027:        stloc.1
IL_0028:        ldloc.2
IL_0029:        ldc.i4.1
IL_002a:        add
IL_002b:        stloc.2
IL_002c:        ldloc.2
IL_002d:        ldc.i4   0x989680
IL_0032:        blt.s    IL_000c
IL_0034:        ldarg.1

The JIT doesn't look for these sorts of optimizations at all and faithfully converted each line of the formula into a processor instruction.  So how did the optimized C# version fare?  The run time...drum roll please...250ms.  We cut the running time almost in half, but we're still quite a bit slower than the native code.  Let's look at the assembly code generated by the JIT for the optimized version:

000B2564      ldr      lr, [r11, #0x28]
000B2568      ands   lr, lr, #0x17
000B256C      blne    000A00BC
000B2570      mov    r1, r6, lsl #5
000B2574      add     r7, r5, r1
000B2578      sub     r7, r7, #0x20
000B257C      mul     r6, r5, r7
000B2580      add     r5, r5, #1
000B2584      ldr      r1, [pc, #0x4C]
000B2588      cmp    r5, r1
000B258C      blt      000B2564

One simple way to start analyzing the performance difference is to simply look at the number of instructions executed in each case.  The optimized native code takes 6 instructions to execute each iteration of the loop.  The JIT-compiled code takes 11.  This alone should cause the managed version to run twice as long.  Furthermore, two of the instructions in the managed code are loading a value from memory.  The extra instructions and the memory latency explain the 2.5x longer running time for the JIT-compiled code.

It is interesting to note that seven of the JIT instructions are handling the loop compared with three in the native case.  Ignoring these instructions, the native code executes the formula itself in three instructions whereas the managed code takes four.  How is this possible?  Notice the first native instruction related to the formula:

011E102C      add     r3, r2, r4, lsl #5

This is actually taking advantage of a feature of the ARM add instruction that lets you specify a shift for the right operand.  This feature is useful for indexing into arrays.  The native compiler was able to take advantage of this feature to implement the first line of the formula as re-written above with one instruction.  The JIT doesn't look for this particular optimization and implements that line with two instructions:

000B2570      mov    r1, r6, lsl #5
000B2574      add     r7, r5, r1

Aside from that line, the other two instructions generated by the JIT are identical to those generated by the native compiler.  The performance difference in this run thus seems to be dominated by the loop overhead.  For my final test, I unrolled the loop and repeated the operation five times per iteration in the C# code:

/// <summary>
/// Hand-optimized AND 5x-unrolled version of the math benchmark, used to
/// amortize the per-iteration loop overhead (including the JIT's event
/// check preamble) across five formula evaluations.
/// </summary>
/// <param name="duration">Receives the elapsed time in milliseconds
/// (Environment.TickCount).</param>
/// <returns>The accumulated result.</returns>
/// <remarks>i advances by 5 per trip -- four i++ statements in the body plus
/// the for-increment -- so the full 0..9,999,999 range is still covered, in
/// 2,000,000 loop iterations.  The i++ statements are kept (rather than
/// folded into constants) so each one still counts as a math operation,
/// keeping the comparison with the native version fair.</remarks>
private int DoOptimizedUnrolledManagedMathTest(out int duration)
{
    int start = Environment.TickCount;

    int result = 0;

    for (int i = 0; i < 10000000; i++)
    {
        int tmp = i + (result << 5);
        tmp = tmp - 0x20;
        result = tmp * i;

        i++;

        tmp = i + (result << 5);
        tmp = tmp - 0x20;
        result = tmp * i;

        i++;

        tmp = i + (result << 5);
        tmp = tmp - 0x20;
        result = tmp * i;

        i++;

        tmp = i + (result << 5);
        tmp = tmp - 0x20;
        result = tmp * i;

        i++;

        tmp = i + (result << 5);
        tmp = tmp - 0x20;
        result = tmp * i;
    }

    duration = Environment.TickCount - start;

    return result;
}

I wanted it to be a fair fight, so I still increment i 5 times since this counts as a math operation.  Running this version, we get a managed execution time of around 150ms.  This is much closer to the native running time and much better than the original C#.  A big chunk of the difference is that the JIT is generating four instructions to the native compiler's three because of the native optimization mentioned above.  Adding this 33% to the native execution time gives us 132ms.  The rest of the difference can be explained by the added overhead due to the iteration.

There's one final issue to look at here: I casually mentioned that the JIT-compiled code had 7 loop-related instructions where the native version had 3.  What is the extra JIT-compiled code doing?  Looking at the code itself, it seems as though the first 3 instructions in the JIT version aren't doing anything related to looping:

000B2430      ldr       lr, [r11, #0x28]
000B2434      ands   lr, lr, #0x17
000B2438      blne    000A00BC
000B243C      add     r5, r5, r6
...
000B2460      ldr      r1, [pc, #0x4C]
000B2464      cmp    r6, r1
000B2468      blt      000B2430

The final line could jump to 000B243C instead of 000B2430 and the loop would still behave properly.

The answer is that the JIT inserts an event check preamble into every loop.  This check is how a managed thread finds out if there is anything the EE needs to do--for example, if the GC needs the thread to suspend itself, if the debugger needs to suspend it, if the thread is aborted, etc.  Register r11 is reserved for an internal "ThreadState" structure that includes a set of flags that indicate that we need to call into the EE to do some work.  This preamble is necessary to guarantee that events like Suspend or Abort can happen eventually--since a thread can theoretically loop forever, this approach lets the EE get work done even if the code that's executing isn't playing nice.

Conclusion

Hopefully this article de-mystified some of what the JIT is doing behind the scenes.  The samples above show that the JIT can generate code that competes very well with native code in most circumstances.  There is a recurring theme that the quality of the JIT-compiled code depends on the quality of the IL stream it is processing.  The managed compilers are not as aggressive as the native compilers when it comes to optimization, so there is a little more burden on developers to write efficient code.  But isn't that how we like it anyway?!