Waste

Waste is the number one multiplier why programs get very slow. This giving the CPU instructions that do nothing related to the task at hand.

An example

Let’s consider two different ways an x64 CPU might perform an addition

ADD a, b // a += b
// or
LEA c, [A+B] // c = a + b

Waste in a C program

int add(int A, int B) {
  return A + B;
}

int main() {
  return add(1234, 5678);
}

This is translated to the following instructions in the worst case if the compiler does now compile time optimizations. There is no waste in this code.

LEA eax, [rcx+rdx]
RET // return

Waste in a Python program

def add(a, b):
  return a + b

This is translated first to bytecode and then to assembly. Because of this interpretation, a lot of waste is introduced. Included are the 181 lines of assembly required to add two numbers together in python.

mov         qword ptr [r11+38h],r14
add         r14,2
jmp         _PyEval_EvalFrameDefault+5872h (07FFDFD04F462h)
mov         rdi,qword ptr [r12-8]
lea         rax,[__ImageBase (07FFDFCE20000h)]
mov         rsi,qword ptr [r12-10h]
sub         r12,8
mov         rdx,rdi
mov         rcx,rsi
call        qword ptr [rax+r13*8+3E7A70h]
mov         qword ptr [rsp+8],rbx
push        rdi
sub         rsp,30h
xor         r8d,r8d
mov         rdi,rdx
mov         rbx,rcx
call        binary_op1 (07FFDFCF06C70h)
mov         qword ptr [rsp+10h],rbp
mov         qword ptr [rsp+18h],rsi
mov         qword ptr [rsp+20h],rdi
push        r14
sub         rsp,20h
mov         rsi,rdx
movsxd      r9,r8d
mov         rdx,qword ptr [rcx+8]
mov         rbp,rcx
mov         rdi,qword ptr [rdx+60h]
test        rdi,rdi
je          binary_op1+2Fh (07FFDFCF06C9Fh)
mov         rdi,qword ptr [rdi+r9]
mov         rcx,qword ptr [rsi+8]
mov         qword ptr [rsp+30h],rbx
cmp         rcx,rdx
je          binary_op1+55h (07FFDFCF06CC5h)
xor         ebx,ebx
lea         r14,[_Py_NotImplementedStruct (07FFDFD2E6AF0h)]
test        rdi,rdi
je          binary_op1+0ADh (07FFDFCF06D1Dh)
test        rbx,rbx
je          binary_op1+90h (07FFDFCF06D00h)
mov         rdx,rsi
mov         rcx,rbp
call        rdi
mov         rax,qword ptr [rcx+8]
test        dword ptr [rax+0A8h],1000000h
je          long_add+24h (07FFDFCF60434h)
mov         rax,qword ptr [rdx+8]
test        dword ptr [rax+0A8h],1000000h
jne         _PyLong_Add (07FFDFCF60380h)
sub         rsp,28h
mov         r8,rdx
mov         rdx,qword ptr [rcx+10h]
lea         rax,[rdx+1]
cmp         rax,3
jae         _PyLong_Add+3Eh (07FFDFCF603BEh)
mov         r9,qword ptr [r8+10h]
lea         rax,[r9+1]
cmp         rax,3
jae         _PyLong_Add+3Eh (07FFDFCF603BEh)
mov         ecx,dword ptr [rcx+18h]
mov         eax,dword ptr [r8+18h]
imul        rcx,rdx
imul        rax,r9
add         rcx,rax # RIGHT HERE
add         rsp,28h
jmp         _PyLong_FromSTwoDigits (07FFDFCF5AF10h)
push        rdi
sub         rsp,20h
lea         rax,[rcx+5]
mov         rdi,rcx
cmp         rax,105h
ja          _PyLong_FromSTwoDigits+2Fh (07FFDFCF5AF3Fh)
lea         rax,[rcx+3FFFFFFFh]
cmp         rax,7FFFFFFFh
jae         _PyLong_FromSTwoDigits+48h (07FFDFCF5AF58h)
add         rsp,20h
pop         rdi
jmp         _PyLong_FromMedium (07FFDFCF5AE50h)
mov         qword ptr [rsp+10h],rbx
push        rsi
sub         rsp,20h
movsxd      rsi,ecx
mov         edx,20h
mov         rcx,qword ptr [_PyObject (07FFDFD3B8CA8h)]
call        qword ptr [_PyObject+8h (07FFDFD3B8CB0h)]
push        rbx
sub         rsp,20h
mov         rbx,rdx
call        pymalloc_alloc (07FFDFCF73C80h)
sub         rsp,28h
lea         rax,[rdx-1]
cmp         rax,1FFh
jbe         pymalloc_alloc+17h (07FFDFCF73C97h)
mov         qword ptr [rsp+30h],rbx
mov         qword ptr [rsp+38h],rsi
mov         qword ptr [rsp+40h],rdi
lea         edi,[rdx-1]
shr         edi,4
mov         qword ptr [rsp+20h],r14
lea         r14,[__ImageBase (07FFDFCE20000h)]
lea         eax,[rdi+rdi]
lea         rsi,[rax*8+4C7120h]
mov         rdx,qword ptr [rsi+r14]
mov         rcx,qword ptr [rdx+10h]
cmp         rdx,rcx
je          pymalloc_alloc+9Fh (07FFDFCF73D1Fh)
mov         r9,qword ptr [rdx+8]
inc         dword ptr [rdx]
mov         rax,qword ptr [r9]
mov         qword ptr [rdx+8],rax
test        rax,rax
jne         pymalloc_alloc+1C1h (07FFDFCF73E41h)
mov         r14,qword ptr [rsp+20h]
mov         rax,r9
mov         rdi,qword ptr [rsp+40h]
mov         rsi,qword ptr [rsp+38h]
mov         rbx,qword ptr [rsp+30h]
add         rsp,28h
ret
test        rax,rax
jne         _PyObject_Malloc+46h (07FFDFCF73EA6h)
add         rsp,20h
pop         rbx
ret
mov         rbx,rax
test        rax,rax
jne         _PyLong_FromMedium+4Ah (07FFDFCF5AE9Ah)
mov         qword ptr [rsp+30h],rdi
mov         eax,esi
cdq
mov         rcx,rsi
sar         rcx,3Fh
mov         edi,eax
and         rcx,0FFFFFFFFFFFFFFFEh
lea         rax,[PyLong_Type (07FFDFD2E52E0h)]
xor         edi,edx
mov         qword ptr [rbx+8],rax
inc         rcx
sub         edi,edx
mov         qword ptr [rbx+10h],rcx
test        dword ptr [PyLong_Type+0A8h (07FFDFD2E5388h)],200h
je          _PyLong_FromMedium+88h (07FFDFCF5AED8h)
cmp         dword ptr [_Py_tracemalloc_config+4h (07FFDFD2E7324h)],0
je          _PyLong_FromMedium+99h (07FFDFCF5AEE9h)
mov         dword ptr [rbx+18h],edi
mov         rax,rbx
mov         rdi,qword ptr [rsp+30h]
mov         qword ptr [rbx],1
mov         rbx,qword ptr [rsp+38h]
add         rsp,20h
pop         rsi
ret
mov         rcx,rax
cmp         rax,r14
jne         binary_op1+0D9h (07FFDFCF06D49h)
mov         rbx,qword ptr [rsp+30h]
mov         rbp,qword ptr [rsp+38h]
mov         rsi,qword ptr [rsp+40h]
mov         rdi,qword ptr [rsp+48h]
add         rsp,20h
pop         r14
ret
mov         rcx,rax
lea         rax,[_Py_NotImplementedStruct (07FFDFD2E6AF0h)]
cmp         rcx,rax
je          PyNumber_Add+35h (07FFDFCF073B5h)
mov         rax,rcx
mov         rbx,qword ptr [rsp+40h]
add         rsp,30h
pop         rdi
ret
sub         qword ptr [rsi],1
mov         r15,rax
jne         _PyEval_EvalFrameDefault+58AFh (07FFDFD04F49Fh)
sub         qword ptr [rdi],1
jne         _PyEval_EvalFrameDefault+58C6h (07FFDFD04F4B6h)
mov         qword ptr [r12-8],r15
test        r15,r15
mov         r15,qword ptr [rsp+30h]
je          _PyEval_EvalFrameDefault+161h (07FFDFD049D51h)
add         r14,2
jmp         _PyEval_EvalFrameDefault+59FCh (07FFDFD04F5ECh)

How can we do better?

The only way to make a python program fast is to look for ways to avoid using python. Python packages that are known for speed like numpy and tensorflow will call out to C programs to do the bulk of their work. It is also possible to leverage a JIT compiler that reduces some code and loops into just the necessary instructions.

Failing to do any of these things means you will be run code at a 130x slowdown just for a simple addition. This means a worse experience for the end user and much high server costs.