## Counting bits set to 1 in bytes with Python / Popcount or Hamming Weight

I want to run popcount on a large set of 144-byte-long bytearrays. Doing so in Python can be a PITA! At first I used the naive approach, `bin(b).count('1')`, but it was very slow. I did some research on the internet for faster algorithms and found some. But running an algorithm on a single byte is one thing; running it on a larger sequence of bytes (Python 3 type `bytes` with `len()` = 144) is a different story. So I decided to benchmark several algorithms:

def count_A(bb):
    """Return the number of bits set to 1 in *bb* (naive baseline).

    Each byte is rendered as a binary string with ``bin()`` and its '1'
    characters are counted.

    bb: an iterable of non-negative ints, e.g. ``bytes`` or ``bytearray``.
    """
    return sum(bin(byte).count('1') for byte in bb)

def count_B(bb):
    """Return the number of bits set to 1 in *bb*, digit by digit.

    Each byte is rendered with ``bin()`` and every binary digit is compared
    against '1' individually; the True results are summed.

    bb: an iterable of non-negative ints, e.g. ``bytes`` or ``bytearray``.
    """
    total = 0
    for byte in bb:
        # bin() returns e.g. '0b101'; [2:] drops the '0b' prefix.  The inner
        # variable has its own name so it no longer shadows the loop variable.
        total += sum(digit == '1' for digit in bin(byte)[2:])
    return total

def count_C(bb):
    """Return the number of bits set to 1 in *bb* by clearing set bits.

    For every value, the lowest set bit is cleared repeatedly until nothing
    is left; each clearing step accounts for exactly one set bit.

    bb: an iterable of ints, e.g. ``bytes`` or ``bytearray``.
    """
    total = 0
    for value in bb:
        remaining = value
        while remaining > 0:
            # x & (x - 1) removes the lowest set bit of x.
            remaining = remaining & (remaining - 1)
            total = total + 1
    return total

def count_D(bb):
    """Return the number of bits set to 1 in *bb* using 64-bit parallel sums.

    based on http://go.klaus.pw/hamming-weights_python

    The data is read as unsigned 64-bit words and the set bits of each word
    are added up with mask-and-shift arithmetic instead of a per-bit loop.

    bb: a bytes-like object (``bytes``, ``bytearray``).  Any length is
        accepted; when it is not a multiple of 8, the trailing bytes are
        folded into one extra word.
    """
    word_count, tail_len = divmod(len(bb), 8)
    # A repeat count ('18Q') avoids building an 18-character format string,
    # and '<' pins the word layout.  Byte order does not affect a bit count.
    words = struct.unpack_from('<%dQ' % word_count, bb)
    if tail_len:
        # At most 7 bytes remain, so the value always fits in 64 bits.
        words += (int.from_bytes(bb[word_count * 8:], 'little'),)

    s = 0
    for n in words:
        # Every 2-bit field now holds the count of its own two bits.
        n -= (n >> 1) & 0x5555555555555555
        # Add neighbouring fields: every 4-bit field holds a count (0..4).
        n = (n & 0x3333333333333333) + ((n >> 2) & 0x3333333333333333)
        # Add neighbouring nibbles: every byte holds a count (0..8).
        n = (n + (n >> 4)) & 0x0f0f0f0f0f0f0f0f
        # The multiplication sums all eight byte counts into the top byte.
        # Python ints never overflow, so the product is masked to 64 bits
        # before the top byte is shifted down.
        s += ((n * 0x0101010101010101) & 0xffffffffffffffff) >> 56
    return s

def count_E(bb):
    """Return the number of bits set to 1 in *bb*, 64 bits at a time.

    The data is read as unsigned 64-bit words; each word is rendered with
    ``bin()`` and its '1' characters are counted.

    bb: a bytes-like object (``bytes``, ``bytearray``).  Any length is
        accepted; when it is not a multiple of 8, the trailing bytes are
        counted as one extra, shorter integer.
    """
    word_count, tail_len = divmod(len(bb), 8)
    # A repeat count ('18Q') avoids building an 18-character format string,
    # and '<' pins the word layout.  Byte order does not affect a bit count.
    words = struct.unpack_from('<%dQ' % word_count, bb)
    s = sum(bin(n).count('1') for n in words)
    if tail_len:
        # Count the 1..7 bytes that did not fill a whole 64-bit word.
        s += bin(int.from_bytes(bb[word_count * 8:], 'little')).count('1')
    return s

# Sample input: an 8-byte pattern containing 20 set bits, repeated 18 times,
# which gives 144 bytes and 360 set bits in total.
a = b"\xFF\xFF\x01\x00\x30\x00\x00\x01" * 18

# All five implementations work and return the same result for this input
# (the value shown under each call is the interactive-session output):
count_A(a)
# 360
count_B(a)
# 360
count_C(a)
# 360
count_D(a)
# 360
count_E(a)
# 360

# Benchmark every implementation on the same 144-byte input.  No `number`
# argument is passed, so each statement runs timeit's default 1,000,000
# times; the value under each call is the total time in seconds as
# originally recorded.  Each setup string imports exactly the function that
# its statement calls.
timeit.timeit('count_A(a)', 'from __main__ import a, count_A')
# 62.88516672200058
timeit.timeit('count_B(a)', 'from __main__ import a, count_B')
# 177.9137850170082
timeit.timeit('count_C(a)', 'from __main__ import a, count_C')
# 66.40554738498759
timeit.timeit('count_D(a)', 'from __main__ import a, count_D')
# 17.54908391900244
timeit.timeit('count_E(a)', 'from __main__ import a, count_E')
# 17.919566555006895