From 2f1b34b4cd94360e6565000925e9ea02342fb616 Mon Sep 17 00:00:00 2001
From: Timothee Leclaire-Fournier
Date: Sat, 2 Mar 2024 12:48:49 -0500
Subject: [PATCH] Meta: Add more documentation and correct naming.

---
 README.md     | 25 +++++++++++++++++++++----
 allocPool.hpp | 12 +++++++++---
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 355c061..8de7375 100644
--- a/README.md
+++ b/README.md
@@ -5,17 +5,22 @@ avoid expensive allocations during runtime. This preallocates objects in the
 constructor (with threads) then offers you two functions: `getPtr()` and `returnPtr(ptr)`.
 
 Using C++ concepts, we can use templates and require the class given to have a
-default constructor and to have a .reset() function. It will be used to clean the
+default constructor and to have a `.reset()` function. It will be used to clean the
 objects before giving them to another caller.
 
-This pool uses a hashmap and a pivot to make returnPtr(ptr) extremely fast.
+We avoid false sharing by giving each thread a large amount of work, so cache lines
+are rarely shared between threads. While this pool uses a hashmap and a pivot to make
+`returnPtr(ptr)` extremely fast, the bottleneck during construction is the locking and
+unlocking of the hashmap's mutex. This lock is needed because a `std::unordered_map`
+cannot be written to concurrently, even at different hashes.
 
 It will automatically grow when the max capacity is reached, though there will be a
 performance penalty.
 
 ## Performance
 With a simple stub class and a pool of 10000 objects, using the pool to take a pointer
-and give it back takes 3 ms vs 19 ms when allocating and deallocating by hand.
+and give it back for each element is significantly faster than doing it by hand.
+
 ```
 class stub {
 public:
@@ -27,4 +32,16 @@ public:
 private:
     int i = 15;
 };
-```
\ No newline at end of file
+```
+```
+Time (milliseconds) required for allocations without pool: 21
+Time (milliseconds) required for allocations with pool: 3
+Time (milliseconds) required for real allocations when constructing pool: 9
+```
+
+This trivial example shows a clear performance improvement that would be much more
+pronounced if the allocation and construction of the objects were more complex.
+
+## Safety
+AddressSanitizer, LeakSanitizer and ThreadSanitizer have been used to ensure the safety
+of the class. Tests have been added to verify the correct behavior in all cases.
\ No newline at end of file
diff --git a/allocPool.hpp b/allocPool.hpp
index f57d437..0cadab6 100644
--- a/allocPool.hpp
+++ b/allocPool.hpp
@@ -59,13 +59,17 @@ private:
     void initArray(size_t amount) {
         const auto amountOfThreads{std::thread::hardware_concurrency()};
         assert(amountOfThreads);
-        const auto amountPerThreads{amount / amountOfThreads};
+        const auto amountPerThread{amount / amountOfThreads};
 
         std::vector<std::thread> threads;
         threads.reserve(amountOfThreads);
 
+        // An allocPool is expected to hold a lot of objects, so each thread's chunk
+        // *should* span more than a cache line and, for the most part, avoid false
+        // sharing. If it doesn't, the total amount of objects is small enough that
+        // false sharing shouldn't matter anyway.
         for (size_t i{}; i < amountOfThreads; i++)
-            threads.emplace_back(&allocPool::initObjects, this, i * amountPerThreads, amountPerThreads);
+            threads.emplace_back(&allocPool::initObjects, this, i * amountPerThread, amountPerThread);
 
         for (auto &t: threads)
             t.join();
@@ -76,9 +80,11 @@ private:
 
     void initObjects(size_t startIdx, size_t amount) {
         for (size_t i{}; i < amount; i++) {
-            // TODO: Be more cache friendly by making a vector per thread, then doing memcpy into the original vector.
             vec[startIdx + i] = new T;
         }
+
+        // In the future, it should be possible to write a custom hashmap with sections
+        // with independent locks, or use a data structure which would be contiguous.
         std::lock_guard guard(positionMapMutex);
         for (size_t i{}; i < amount; i++) {
             positionMap[vec[startIdx + i]] = i;
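
A minimal usage sketch of the API the README describes. This is an illustration under
assumptions, not code from the patch: the constructor taking the number of objects to
preallocate, the raw-pointer return type of `getPtr()`, and the `value()` accessor on
the stub are guesses; only `allocPool`, `getPtr()`, `returnPtr(ptr)`, and the
default-constructor/`.reset()` requirements come from the patch itself.

```
// Usage sketch. Assumptions (not confirmed by the patch): the pool's constructor
// takes the number of objects to preallocate, and getPtr() hands out a raw T*
// that is later handed back through returnPtr().
#include "allocPool.hpp"

#include <cassert>

// Satisfies the documented requirements: default-constructible, with a reset()
// that cleans the object before it is handed to another caller.
class stub {
public:
    stub() = default;
    void reset() { i = 15; }
    int value() const { return i; }  // illustrative accessor, not in the README stub

private:
    int i = 15;
};

int main() {
    allocPool<stub> pool(10000);  // assumed ctor: preallocate 10000 stubs using threads

    stub *p = pool.getPtr();      // take a preallocated, already cleaned object
    assert(p->value() == 15);

    pool.returnPtr(p);            // the hashmap + pivot make the return cheap
}
```

The take/return cycle is the whole interface: `reset()` is what lets a returned object
be handed out again as if it were freshly constructed, without any new allocation.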