diff --git a/thesis/chapters/appendix.tex b/thesis/chapters/appendix.tex
index b89aca8..ef0fd48 100644
--- a/thesis/chapters/appendix.tex
+++ b/thesis/chapters/appendix.tex
@@ -74,9 +74,12 @@ It has been mentioned several times that the implementation using
 implementation.  To support this statement a simple benchmark can be used. The
 relatively simple Pauli $X$ is used, more complicated gates like $CX$ or $H$
 have worse performance when implemented in \lstinline{python}. The performance
-improvement when using the \lstinline{ufunc} is around $1.7$ in this tested
+improvement when using the \lstinline{ufunc} is a factor of around $6.4$ in this tested
 case. One must however note that the tested \lstinline{python} code is not
 realistic and in a possible applications there would be a significant overhead.
 
 \lstinputlisting[title={Code to Benchmark \lstinline{ufunc} Gates against Python}, language=Python, breaklines=True]{extra_benchmark/benchmark.py}
 
+When using \lstinline{result_py[0::2] = qm_state[1::2]} the result is identical and
+the performance improvement of the \lstinline{ufunc} is only a factor of around $1.7$. This
+method is, however, not applicable to general act-qbits, where the bit mask has to be used.
diff --git a/thesis/extra_benchmark/benchmark.py b/thesis/extra_benchmark/benchmark.py
index 36b1ef7..a0759a9 100644
--- a/thesis/extra_benchmark/benchmark.py
+++ b/thesis/extra_benchmark/benchmark.py
@@ -23,13 +23,15 @@ time_uf = timeit.repeat("result_uf = gate_uf(qm_state, cl_state)"
             , number=1_000_000
             )
 
-time_py = timeit.repeat("result_py = np.zeros(2**10, dtype=np.cdouble);"
-                    "result_py[0::2] = qm_state[1::2];"
-                    "result_py[1::2] = qm_state[0::2];"
-                    "cl_py = np.zeros(10, dtype=np.int8)"
+time_py = timeit.repeat(
+                "result_py = np.zeros(2**10, dtype=np.cdouble);"
+                "result_py[~bit_mask] = qm_state[bit_mask];"
+                "result_py[bit_mask] = qm_state[~bit_mask];"
+                "cl_py = np.zeros(10, dtype=np.int8)"
             , setup="import numpy as np;"
                     "qm_state = np.zeros(2**10, dtype=np.cdouble);"
                     "qm_state[0] = 1;"
+                    "bit_mask = np.array([1 if (i & (1 << 0)) else 0 for i in range(2**10)]).astype(bool)"  # must be a bool mask: an int 0/1 array would be integer fancy indexing and ~ would yield -1/-2
             , repeat=5
             , number=1_000_000
             )
@@ -38,10 +40,11 @@ print(" done")
 print("running test ...", end="", flush=True)
 result_uf, cl, m = gate_uf(qm_state, cl_state);
 
+bit_mask = np.array([1 if (i & (1 << 0)) else 0 for i in range(2**10)]).astype(bool)  # builtin bool: the np.bool alias was removed in NumPy 1.24
 result_py = np.zeros(2**10, dtype=np.cdouble)
 cl_py = np.zeros(10, dtype=np.int8)
-result_py[0::2] = qm_state[1::2]
-result_py[1::2] = qm_state[0::2]
+result_py[~bit_mask] = qm_state[bit_mask]
+result_py[bit_mask] = qm_state[~bit_mask]
 
 assert np.allclose(result_py, result_uf)
 print(" done")