-
Notifications
You must be signed in to change notification settings - Fork 46
Add FP32 operators and tiling support for MicroLlama on Snitch #153
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: devel
Are you sure you want to change the base?
Changes from all commits
1a6308e
1bdf9c9
35d51ef
76a4678
c222810
497e9c1
bf5ddb7
7e3659d
28280fb
ac5d541
c90b35c
5669c28
cf4d9bd
c04bd6a
bdc550e
b53ff75
b355624
5306134
32d88c0
7092e35
b2199cb
7ad03a3
d76f6f1
1c62b68
32d4bfa
66c4b4f
be96413
89d382a
e55c7bc
13a4e64
55b6750
06010e4
85a68fd
182a2c3
03125c0
027ccab
064981a
9be8768
fdc0c82
7813684
4865516
fc8ea3f
b6b6eb5
4e8448b
7003801
e693be7
1633a71
0a66cf4
6b357cf
c7b9771
6e736f4
1062ec2
f7d62b0
ebf9009
ebe3e46
94aea9e
9fa6fd7
f87f98b
1924e5e
dfe3f0a
56e2d57
ba1b27e
c54195f
05b01ef
8338159
79c47fa
4ec3fb3
aae6667
e3c0ba8
205a3a5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
diaconuccalin marked this conversation as resolved.
diaconuccalin marked this conversation as resolved.
|
|
diaconuccalin marked this conversation as resolved.
diaconuccalin marked this conversation as resolved.
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,3 +31,5 @@ ignore: | |
| - "**/toolchain/" | ||
| # Ignore all files in .git | ||
| - "**/.git/**" | ||
| # Ignore all files in .venv | ||
| - "**/.venv/" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,37 @@ | |
| from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer | ||
|
|
||
|
|
||
def compute_broadcast_strides(shape1, shape2, out_shape):
    """Compute element strides for ONNX/NumPy-style broadcasting.

    Both input shapes are left-padded with 1s up to the rank of
    ``out_shape``. Each input then gets one stride per output dimension:
    a dimension of size 1 that is expanded to an output dimension larger
    than 1 gets stride 0, every other dimension gets the usual contiguous
    row-major stride of the (unexpanded) input.

    Args:
        shape1: Shape of the first input; any sequence of integers
            (list, tuple, NumPy shape, ...).
        shape2: Shape of the second input; any sequence of integers.
        out_shape: Shape of the broadcast result; any sequence of integers.

    Returns:
        A tuple ``(strides1, strides2)`` of two lists of plain Python ints,
        each of length ``len(out_shape)``.

    Raises:
        ValueError: If an input shape has more dimensions than ``out_shape``.
            Such a shape cannot be broadcast to the output and would
            otherwise yield silently misaligned strides.

    Example:
        shape1=[8,8,8], shape2=[8]
        -> strides1=[64,8,1], strides2=[0,0,1]
    """
    # Normalise to plain ints so tuples and NumPy integer dims are accepted
    # and the returned strides never carry NumPy scalar types.
    out_dims = [int(dim) for dim in out_shape]
    ndim = len(out_dims)

    def _calc_strides(shape):
        dims = [int(dim) for dim in shape]
        if len(dims) > ndim:
            raise ValueError(f"Cannot broadcast shape {dims} to output shape {out_dims}: "
                             f"input has more dimensions ({len(dims)}) than the output ({ndim})")

        # Left-pad with 1s so the trailing dimensions line up with the output.
        padded = [1] * (ndim - len(dims)) + dims

        strides = [0] * ndim
        stride = 1
        # Walk from the innermost dimension outwards, accumulating the
        # contiguous stride of the original (unexpanded) input.
        for i in range(ndim - 1, -1, -1):
            if padded[i] == 1 and out_dims[i] > 1:
                # Broadcast dimension: every output index reuses the same element.
                strides[i] = 0
            else:
                strides[i] = stride
            # Only dimensions larger than 1 contribute to the running stride.
            stride *= padded[i] if padded[i] > 1 else 1
        return strides

    return _calc_strides(shape1), _calc_strides(shape2)
|
|
||
|
|
||
| class UnaryElementWiseParser(NodeParser): | ||
|
|
||
| def parseNode(self, node: gs.Node) -> bool: | ||
|
|
@@ -72,6 +103,10 @@ def parseNode(self, node: gs.Node) -> (bool): | |
| self.operatorRepresentation['n_levels'] = int(node.attrs['n_levels']) | ||
| self.operatorRepresentation['log2D'] = int(math.log2(node.attrs['D'])) | ||
|
|
||
| stash_type = node.attrs.get('stash_type', 1) | ||
| if stash_type != 1: | ||
| raise ValueError(f"iRMSNorm: only stash_type=1 (FP32) is supported, got {stash_type}") | ||
|
|
||
| return ret | ||
|
|
||
| def parseNodeCtxt(self, | ||
|
|
@@ -87,8 +122,19 @@ def parseNodeCtxt(self, | |
| for idx, outputNode in enumerate(node.outputs): | ||
| self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name | ||
|
|
||
| self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) | ||
| self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1] | ||
| input_shape = list(ctxt.lookup(node.inputs[0].name).shape) | ||
|
|
||
| axis = node.attrs.get('axis', -1) | ||
| if axis < 0: | ||
| axis = len(input_shape) + axis | ||
|
|
||
| self.operatorRepresentation['inputSize'] = int(np.prod(input_shape)) | ||
| self.operatorRepresentation['NormalizedAxesSize'] = int(np.prod(input_shape[axis:])) | ||
| self.operatorRepresentation['scale'] = node.inputs[1].values | ||
|
|
||
| # Keep old keys for C template compatibility | ||
| self.operatorRepresentation['size'] = int(np.prod(input_shape)) | ||
| self.operatorRepresentation['lastDimLength'] = int(input_shape[-1]) | ||
|
Comment on lines
+136
to
+137
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please propagate the name change of the |
||
|
|
||
| return ctxt, True | ||
|
|
||
|
|
@@ -488,23 +534,37 @@ def __init__(self): | |
| super().__init__() | ||
|
|
||
| def parseNode(self, node: gs.Node) -> bool: | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remember to run make format to fix these weird blank line changes |
||
| ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) | ||
|
|
||
| return ret | ||
|
|
||
| def parseNodeCtxt(self, | ||
| ctxt: NetworkContext, | ||
| node: gs.Node, | ||
| channels_first: bool = True) -> Tuple[NetworkContext, bool]: | ||
|
|
||
| data_in_1 = ctxt.lookup(node.inputs[0].name) | ||
| data_in_2 = ctxt.lookup(node.inputs[1].name) | ||
| data_out = ctxt.lookup(node.outputs[0].name) | ||
|
|
||
| self.operatorRepresentation['data_in_1'] = data_in_1.name | ||
| self.operatorRepresentation['data_in_2'] = data_in_2.name | ||
| self.operatorRepresentation['data_out'] = data_out.name | ||
| self.operatorRepresentation['size'] = np.prod(data_in_1.shape) | ||
| self.operatorRepresentation['size'] = np.prod(data_out.shape) | ||
|
|
||
| # Check if broadcasting is needed | ||
| shape1 = list(data_in_1.shape) | ||
| shape2 = list(data_in_2.shape) | ||
| out_shape = list(data_out.shape) | ||
|
|
||
| need_broadcast = (shape1 != out_shape) or (shape2 != out_shape) | ||
| self.operatorRepresentation['need_broadcast'] = need_broadcast | ||
|
|
||
| if need_broadcast: | ||
| strides1, strides2 = compute_broadcast_strides(shape1, shape2, out_shape) | ||
|
Comment on lines
+553
to
+562
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Broadcasting should be handled in the Here I am confused at what is happening with strides and all.... |
||
|
|
||
| self.operatorRepresentation['ndim'] = len(out_shape) | ||
| self.operatorRepresentation['strides1'] = strides1 | ||
| self.operatorRepresentation['strides2'] = strides2 | ||
|
diaconuccalin marked this conversation as resolved.
|
||
| self.operatorRepresentation['out_shape'] = out_shape | ||
|
|
||
| return ctxt, True | ||
|
|
||
|
|
@@ -2097,15 +2157,15 @@ def parseNodeCtxt(self, | |
| node: gs.Node, | ||
| channels_first: bool = True) -> Tuple[NetworkContext, bool]: | ||
|
|
||
| inputs = ["input1", "input2"] | ||
| outputs = ["output"] | ||
| inputs = ["A", "B"] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why changing that?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might have been a past nitpick comment of mine (it's been 4-5 months so I don't really remember 😅), but this is the naming used in the onnx standard. It would make sense for us to keep it like this, but we can also revert it, since it's slightly outside of the scope of this PR. |
||
| outputs = ["C"] | ||
| for idx, inputNode in enumerate(node.inputs): | ||
| if idx < len(inputs): | ||
| self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name | ||
| for idx, outputNode in enumerate(node.outputs): | ||
| self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name | ||
|
|
||
| self.operatorRepresentation['size'] = np.prod(ctxt.lookup(self.operatorRepresentation['input1']).shape) | ||
| self.operatorRepresentation['size'] = np.prod(ctxt.lookup(self.operatorRepresentation['A']).shape) | ||
|
|
||
| return ctxt, True | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,5 +6,5 @@ | |
|
|
||
| referenceTemplate = NodeTemplate(""" | ||
| // Division (Name: ${nodeName}, Op: ${nodeOp}) | ||
| SINGLE_CORE Div_fp${input1_type.referencedType.typeWidth}_fp${input2_type.referencedType.typeWidth}_fp${output_type.referencedType.typeWidth}(${input1}, ${input2}, ${output}, ${size}); | ||
| SINGLE_CORE Div_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}(${A}, ${B}, ${C}, ${size}); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change seems unnecessary.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I remember Calin advised me to do it to keep the input naming convention consistent.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here, possibly an old nitpick comment of mine, this is the naming convention for Div in ONNX, can be reverted, no problem. |
||
| """) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,12 @@ def alignToContext(self, ctxt: NetworkContext, | |
| bufferIn.aliases.add(bufferOut.name) | ||
| bufferOut.aliases.add(bufferIn.name) | ||
|
|
||
| # Tiling still reads the legacy single-valued `_alias` attribute | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be better practice to do the fix in the tiler (prevent it from relying on _alias and instead use the new alises parameter)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This workload is too heavy, it would touch the deeploy core:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suggest opening a new PR to fix this.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's get a second opinion on this one from @Victor-Jung maybe
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is the intended behavior to instantiate a subclass to put this hack in place, just as it's done for the @lee2716 can you open an issue describing your fix to the tiler such that one can open a PR against it later? |
||
| # (TilerExtension / MemoryScheduler). Set it here so platforms that | ||
| # rely on Reshape pointer-passthrough during tiling don't each need | ||
| # to carry the same workaround in a subclass. | ||
| bufferOut._alias = bufferIn.name | ||
|
|
||
| return ctxt, operatorRepresentation, [] | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,11 +12,26 @@ | |
| from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint | ||
| from Deeploy.TilingExtension.TileConstraint import TileConstraint | ||
| from Deeploy.TilingExtension.TilerModel import TilerModel | ||
| from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme | ||
| from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ | ||
| VariableReplacementScheme | ||
|
|
||
|
|
||
| class BOPTileConstraint(TileConstraint): | ||
| """Tile constraint class for binary operators, i.e. operators that use two input tensors of equal dimensions | ||
| """Tile constraint class for binary operators, i.e. operators that have exactly 2 inputs and 1 output. | ||
|
|
||
| When the second input is a scalar (total size 1), it is kept full-size and only | ||
|
diaconuccalin marked this conversation as resolved.
|
||
| the first input and the output are tiled together. This supports ONNX | ||
| broadcasting in operators that have a corresponding scalar kernel. | ||
|
diaconuccalin marked this conversation as resolved.
|
||
|
|
||
| Warning: | ||
| Broadcasting support is partial -- only the case of a fully-scalar | ||
| second input (np.prod(input2.shape) == 1) is handled. Other ONNX | ||
| broadcasting patterns -- input1 scalar, partial broadcasting such | ||
| as (N, 1) + (1, M), single-dim broadcasting such as (N, M, K) + | ||
| (N, 1, K), or rank-mismatched shapes such as (N, M) + (M,) -- | ||
| fall through to the non-scalar branch, where the dim-equality | ||
| constraints will fail to satisfy. Operators that need full ONNX | ||
| broadcasting must use a different tile constraint. | ||
| """ | ||
|
|
||
| dataIn1Name = 'data_in_1' #: str: Name of the first input tensor as defined by the operator's parser | ||
|
|
@@ -34,14 +49,27 @@ def addGeometricalConstraint(cls, tilerModel: TilerModel, parseDict: Dict, ctxt: | |
| tilerModel.addTensorDimToModel(ctxt, bufferName) | ||
|
|
||
| input1Shape = ctxt.lookup(inputBuffer1Name).shape | ||
|
|
||
| for dim in range(len(input1Shape)): | ||
| inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) | ||
| inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) | ||
| outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) | ||
|
|
||
| tilerModel.addConstraint(inputDim1Var == inputDim2Var) | ||
| tilerModel.addConstraint(inputDim1Var == outputDimVar) | ||
| input2Shape = list(ctxt.lookup(inputBuffer2Name).shape) | ||
| is_scalar = (np.prod(input2Shape) == 1) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NITPICK: Rename
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you Victor, I can rename it later. Also, I'll open an issue to describe my fix to the tiler and open a new PR to fix it. |
||
|
|
||
| if is_scalar: | ||
| # Scalar broadcasting: tile input1 and output together; input2 stays full-size. | ||
| for dim in range(len(input1Shape)): | ||
| inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) | ||
| outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) | ||
| tilerModel.addConstraint(inputDim1Var == outputDimVar) | ||
| for dim in range(len(input2Shape)): | ||
| inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) | ||
| tilerModel.addConstraint(inputDim2Var == input2Shape[dim]) | ||
| else: | ||
| # Element-wise: all three tensors tiled identically. | ||
| for dim in range(len(input1Shape)): | ||
| inputDim1Var = tilerModel.getTensorDimVar(tensorName = inputBuffer1Name, dimIdx = dim) | ||
| inputDim2Var = tilerModel.getTensorDimVar(tensorName = inputBuffer2Name, dimIdx = dim) | ||
| outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = dim) | ||
|
|
||
| tilerModel.addConstraint(inputDim1Var == inputDim2Var) | ||
| tilerModel.addConstraint(inputDim1Var == outputDimVar) | ||
|
|
||
| return tilerModel | ||
|
|
||
|
|
@@ -64,11 +92,18 @@ def serializeTilingSolution( | |
| newSize = np.prod(cube.dims) | ||
| replacements["size"].append(newSize) | ||
|
|
||
| input2Shape = list(ctxt.lookup(operatorRepresentation[cls.dataIn2Name]).shape) | ||
| is_scalar = (np.prod(input2Shape) == 1) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NITPICK: Same renaming here |
||
|
|
||
| inputLoadSchedule = [] | ||
| outputLoadSchedule = [] | ||
|
|
||
| for cube in outputCubes: | ||
| inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) | ||
| if is_scalar: | ||
| in2Cube = HyperRectangle(tuple([0] * len(input2Shape)), tuple(input2Shape)) | ||
|
diaconuccalin marked this conversation as resolved.
|
||
| inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: in2Cube}) | ||
| else: | ||
| inputLoadSchedule.append({cls.dataIn1Name: cube, cls.dataIn2Name: cube}) | ||
|
|
||
| for out in outputCubes: | ||
| outputLoadSchedule.append({cls.dataOutName: out}) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.