Skip to content

Commit c38a576

Browse files
+ Added pure Python chunk splitting implementation
1 parent c0e5610 commit c38a576

File tree

1 file changed

+53
-0
lines changed

1 file changed

+53
-0
lines changed

src/thread/utils/algorithm.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
This file shall host the shared algorithms
3+
4+
If it gets too dense, we could consider splitting it into a library import
5+
|_ algorithm/
6+
|_ __init__.py
7+
|_ a.py
8+
|_ b.py
9+
"""
10+
11+
from typing import List, Sequence, Any
12+
13+
14+
def chunk_split(dataset: Sequence[Any], number_of_chunks: int) -> List[List[Any]]:
    """
    Splits a dataset into balanced, contiguous chunks

    If the size of the dataset is not fully divisible by the number of chunks,
    the leading chunks each receive one extra element, like this
    > `[ [n+1], [n+1], [n+1], [n], [n], [n] ]`

    Each chunk is produced by slicing `dataset`, so the chunks have whatever
    type slicing the dataset yields (lists when a list is passed in).


    Parameters
    ----------
    :param dataset: This should be the dataset you want to split into chunks
    :param number_of_chunks: This should be the number of chunks to split into; must be a positive integer


    Returns
    -------
    :returns list[list[Any]]: The split dataset, containing exactly `number_of_chunks` chunks

    Raises
    ------
    ValueError: The number of chunks specified is not a positive integer
    AssertionError: The number of chunks specified is larger than the dataset size
    """
    # A zero count would divide by zero and a negative count would make the
    # chunk length negative (never reaching the end of the dataset), so reject
    # both up front.
    if number_of_chunks <= 0:
        raise ValueError('The number of chunks must be a positive integer')

    length = len(dataset)
    # Raised explicitly instead of via `assert` so the check is not stripped
    # when Python runs with -O; the exception type is kept for existing callers.
    if length < number_of_chunks:
        raise AssertionError('The number of chunks specified is larger than the dataset size')

    # Every chunk gets `base_size` items; the first `overflow` chunks get one more.
    base_size, overflow = divmod(length, number_of_chunks)

    split = []
    start = 0
    for index in range(number_of_chunks):
        end = start + base_size + (1 if index < overflow else 0)
        split.append(dataset[start:end])
        start = end

    return split
53+

0 commit comments

Comments
 (0)